def parse(instr, maxrecs, progress):
    """Return the DOM object produced by parsing an FPXML string.

    The input is already in footprint format, so "parsing" is the identity
    operation: the text is only handed to ``xmlh.simple_parser``.

    Args:
        instr: FPXML document text.
        maxrecs: not used by this parser.
        progress: when truthy, timestamped progress messages are printed;
            the value is also forwarded to ``xmlh.simple_parser``.

    Returns:
        The value returned by ``xmlh.simple_parser``.
    """
    verbose = bool(progress)
    if verbose:
        # Space-joined to reproduce the original comma-separated print output.
        start_fields = [str(datetime.now()), "parse_footprint: parsing ",
                        str(len(instr)), " bytes."]
        print(" ".join(start_fields))
    dom = xmlh.simple_parser(instr, KNOWN_ELEMENTS, progress)
    if verbose:
        done_fields = [str(datetime.now()), "parse_footprint: done parsing."]
        print(" ".join(done_fields))
    return dom
def parse(instr, maxrecs, progress):
    """Return the DOM object produced by parsing an FPXML string.

    The input is already in footprint format, so "parsing" is the identity
    operation: the text is only handed to ``xmlh.simple_parser``.

    Args:
        instr: FPXML document text.
        maxrecs: not used by this parser.
        progress: when truthy, timestamped progress messages are printed;
            the value is also forwarded to ``xmlh.simple_parser``.

    Returns:
        The value returned by ``xmlh.simple_parser``.
    """
    verbose = bool(progress)
    if verbose:
        # Space-joined to reproduce the original comma-separated print output.
        start_fields = [str(datetime.now()), "parse_footprint: parsing ",
                        str(len(instr)), " bytes."]
        print(" ".join(start_fields))
    dom = xmlh.simple_parser(instr, KNOWN_ELEMENTS, progress)
    if verbose:
        done_fields = [str(datetime.now()), "parse_footprint: done parsing."]
        print(" ".join(done_fields))
    return dom
def parse(instr, maxrecs, progress):
    """Convert a feed of <VolunteerOpportunity> records into FPXML.

    Every ``<VolunteerOpportunity>...</VolunteerOpportunity>`` span found in
    the input is parsed on its own, its organization is registered through
    ``register_org`` and the opportunity is re-emitted in footprint format.
    The feed header is built from the module-level provider settings and
    the organization section from the module-level ``ORGS`` mapping.

    Args:
        instr: raw feed text.
        maxrecs: maximum number of opportunities to convert; values <= 0
            mean "no limit".
        progress: forwarded to ``xmlh.print_rps_progress``.

    Returns:
        A tuple ``(fpxml_string, numorgs, numopps)``.
    """
    numorgs = numopps = 0
    # Rewrite "db:" namespace prefixes (<db:x> -> <db_x>) before parsing.
    instr = re.sub(r'<(/?db):', r'<\1_', instr)
    opps = re.findall(r'<VolunteerOpportunity>.+?</VolunteerOpportunity>',
                      instr, re.DOTALL)

    # Collect fragments in a list and join once instead of repeated "+=".
    opp_parts = []
    for i, oppstr in enumerate(opps):
        # i is zero-based, so stop once maxrecs records have been emitted
        # (the previous "i > maxrecs" test let one extra record through).
        if maxrecs > 0 and i >= maxrecs:
            break
        xmlh.print_rps_progress("opps", progress, i, maxrecs)

        item = xmlh.simple_parser(oppstr, known_elnames, progress=False)
        orgid = register_org(item)

        # logoURL -- sigh, this is for the opportunity not the org
        opp_parts.append('<VolunteerOpportunity>')
        opp_parts.append(xmlh.output_val('volunteerOpportunityID', str(i)))
        opp_parts.append(
            xmlh.output_val('sponsoringOrganizationID', str(orgid)))
        opp_parts.append(
            xmlh.output_node('volunteerHubOrganizationID', item, "LocalID"))
        opp_parts.append(xmlh.output_node('title', item, "Title"))
        # Both abstract and description are fed from the same source tag.
        opp_parts.append(xmlh.output_node('abstract', item, "Description"))
        opp_parts.append(xmlh.output_node('description', item, "Description"))
        opp_parts.append(xmlh.output_node('detailURL', item, "DetailURL"))
        opp_parts.append(xmlh.output_val('volunteersNeeded', "-8888"))

        # Best effort: a failed lookup is treated as "no dates".
        try:
            oppdates = item.getElementsByTagName("OpportunityDate")
        except Exception:
            oppdates = []
        if len(oppdates) > 1:
            print(" ".join([
                str(datetime.now()),
                "parse_servenet.py: only 1 OpportunityDate supported."]))
        # Only the first date is used, even when several are present.
        oppdate = oppdates[0] if len(oppdates) > 0 else None

        opp_parts.append('<dateTimeDurations><dateTimeDuration>')
        if oppdate:
            opp_parts.append(xmlh.output_val('openEnded', 'No'))
            opp_parts.append(xmlh.output_val(
                'duration',
                'P%s%s' % (xmlh.get_tag_val(oppdate, "DurationQuantity"),
                           xmlh.get_tag_val(oppdate, "DurationUnit"))))
            opp_parts.append(xmlh.output_val('commitmentHoursPerWeek', '0'))
            opp_parts.append(
                xmlh.output_node('startDate', oppdate, "StartDate"))
            opp_parts.append(xmlh.output_node('endDate', oppdate, "EndDate"))
        else:
            opp_parts.append(xmlh.output_val('openEnded', 'Yes'))
            opp_parts.append(xmlh.output_val('commitmentHoursPerWeek', '0'))
        opp_parts.append('</dateTimeDuration></dateTimeDurations>')

        opp_parts.append('<locations>')
        # Best effort: a failed lookup is treated as "no locations".
        try:
            opplocs = item.getElementsByTagName("Location")
        except Exception:
            opplocs = []
        for opploc in opplocs:
            opp_parts.append('<location>')
            virtual_tag = opploc.getElementsByTagName("Virtual")
            if (virtual_tag and
                    xmlh.get_tag_val(opploc, "Virtual").lower() == "yes"):
                opp_parts.append(xmlh.output_val('virtual', 'Yes'))
            else:
                opp_parts.append(
                    xmlh.output_node('region', opploc, "StateOrProvince"))
                opp_parts.append(
                    xmlh.output_node('country', opploc, "Country"))
                opp_parts.append(
                    xmlh.output_node('postalCode', opploc, "ZipOrPostalCode"))
            opp_parts.append('</location>')
        opp_parts.append('</locations>')
        opp_parts.append('<categoryTags/>')
        opp_parts.append('</VolunteerOpportunity>')
        numopps += 1

    # convert to footprint format
    feed_parts = [
        '<?xml version="1.0" ?>',
        '<FootprintFeed schemaVersion="0.1">',
        '<FeedInfo>',
        xmlh.output_val('providerID', providerID),
        xmlh.output_val('providerName', providerName),
        xmlh.output_val('feedID', feedID),
        xmlh.output_val('createdDateTime', xmlh.current_ts()),
        xmlh.output_val('providerURL', providerURL),
        xmlh.output_val('description', feedDescription),
        # TODO: capture ts -- use now?!
        '</FeedInfo>',
        # hardcoded: Organization
        '<Organizations>',
    ]
    # ORGS is read only after the loop above has called register_org for
    # every opportunity.
    for key in ORGS:
        feed_parts.append(ORGS[key])
        numorgs += 1
    feed_parts.append('</Organizations>')
    feed_parts.append('<VolunteerOpportunities>')
    feed_parts.extend(opp_parts)
    feed_parts.append('</VolunteerOpportunities>')
    feed_parts.append('</FootprintFeed>')
    return "".join(feed_parts), numorgs, numopps
def parse(s, maxrecs, progress):
    """Return FPXML given volunteermatch data.

    The feed is parsed with ``xmlh.simple_parser``. One <Organization> is
    written per listing (from the listing's single <parent> element), then
    one <VolunteerOpportunity> per listing. An organization shared by
    several listings is therefore written once per listing.

    Args:
        s: raw feed text.
        maxrecs: maximum number of listings to convert; values <= 0, or
            values larger than the number of listings, mean "all of them".
        progress: forwarded to ``xmlh.simple_parser``.

    Returns:
        A tuple ``(fpxml_string, numorgs, numopps)``, or ``None`` when a
        listing lacks exactly one <parent>, one <duration> child or one
        <location> (a timestamped message is printed first).

    Raises:
        ValueError: if a day- or month-based commitment has a
            non-numeric <num>.
    """
    # TODO: progress
    known_elnames = [
        'feed', 'title', 'subtitle', 'div', 'span', 'updated', 'id', 'link',
        'icon', 'logo', 'author', 'name', 'uri', 'email', 'rights', 'entry',
        'published', 'g:publish_date', 'g:expiration_date',
        'g:event_date_range', 'g:start', 'g:end', 'updated', 'category',
        'summary', 'content', 'awb:city', 'awb:country', 'awb:state',
        'awb:postalcode', 'g:location', 'g:age_range', 'g:employer',
        'g:job_type', 'g:job_industry', 'awb:paid',
    ]
    xmldoc = xmlh.simple_parser(s, known_elnames, progress)

    pubdate = xmlh.get_tag_val(xmldoc, "created")
    ts = dateutil.parser.parse(pubdate)
    pubdate = ts.strftime("%Y-%m-%dT%H:%M:%S")

    # convert to footprint format; fragments are joined once at the end
    out = [
        '<?xml version="1.0" ?>',
        '<FootprintFeed schemaVersion="0.1">',
        '<FeedInfo>',
        # TODO: assign provider IDs?
        '<providerID>104</providerID>',
        '<providerName>volunteermatch.org</providerName>',
        '<feedID>1</feedID>',
        '<providerURL>http://www.volunteermatch.org/</providerURL>',
        '<createdDateTime>%s</createdDateTime>' % pubdate,
        '<description></description>',
        '</FeedInfo>',
    ]

    numorgs = numopps = 0

    items = xmldoc.getElementsByTagName("listing")
    # Non-positive limits mean "no limit", as in the sibling parsers. The
    # old "== -1" test let other negative values slice from the list's end.
    if maxrecs <= 0 or maxrecs > items.length:
        maxrecs = items.length
    listings = items[0:maxrecs]

    # hardcoded: Organization
    out.append('<Organizations>')
    for item in listings:
        orgs = item.getElementsByTagName("parent")
        if orgs.length != 1:
            print(" ".join([
                str(datetime.now()),
                "parse_volunteermatch: listing does not have an organization"
            ]))
            return None
        org = orgs[0]
        out.append('<Organization>')
        out.append('<organizationID>%s</organizationID>' %
                   xmlh.get_tag_val(org, "key"))
        out.append('<nationalEIN></nationalEIN>')
        out.append('<name>%s</name>' % xmlh.get_tag_val(org, "name"))
        out.append('<missionStatement></missionStatement>')
        out.append('<description></description>')
        out.append('<location><city></city><region></region>'
                   '<postalCode></postalCode></location>')
        out.append('<organizationURL>%s</organizationURL>' %
                   xmlh.get_tag_val(org, "URL"))
        out.append('<donateURL></donateURL>')
        out.append('<logoURL></logoURL>')
        out.append('<detailURL>%s</detailURL>' %
                   xmlh.get_tag_val(org, "detailURL"))
        out.append('</Organization>')
        numorgs += 1
    out.append('</Organizations>')

    out.append('<VolunteerOpportunities>')
    for item in listings:
        out.append('<VolunteerOpportunity>')
        out.append('<volunteerOpportunityID>%s</volunteerOpportunityID>' %
                   xmlh.get_tag_val(item, "key"))

        orgs = item.getElementsByTagName("parent")
        if orgs.length == 1:
            org = orgs[0]
            out.append(
                '<sponsoringOrganizationIDs><sponsoringOrganizationID>%s'
                '</sponsoringOrganizationID></sponsoringOrganizationIDs>' %
                xmlh.get_tag_val(org, "key"))
        else:
            out.append(
                '<sponsoringOrganizationIDs><sponsoringOrganizationID>0'
                '</sponsoringOrganizationID></sponsoringOrganizationIDs>')
            print(" ".join([
                str(datetime.now()),
                "parse_volunteermatch: listing does not have an organization"
            ]))

        out.append('<title>%s</title>' % xmlh.get_tag_val(item, "title"))
        out.append('<volunteersNeeded>-8888</volunteersNeeded>')

        out.append('<dateTimeDurations><dateTimeDuration>')
        durations = xmlh.get_children_by_tagname(item, "duration")
        if len(durations) != 1:
            print(" ".join([
                str(datetime.now()),
                "parse_volunteermatch: number of durations in item != 1"]))
            return None
        duration = durations[0]
        if duration.getAttribute("ongoing") == 'true':
            out.append('<openEnded>Yes</openEnded>')
        else:
            out.append('<openEnded>No</openEnded>')
        listing_times = duration.getElementsByTagName("listingTime")
        if listing_times.length == 1:
            listing_time = listing_times[0]
            out.append('<startTime>%s</startTime>' %
                       xmlh.get_tag_val(listing_time, "startTime"))
            out.append('<endTime>%s</endTime>' %
                       xmlh.get_tag_val(listing_time, "endTime"))

        commitments = item.getElementsByTagName("commitment")
        l_num = l_period = l_duration = ""
        if commitments.length == 1:
            commitment = commitments[0]
            l_num = xmlh.get_tag_val(commitment, "num")
            l_duration = xmlh.get_tag_val(commitment, "duration")
            l_period = xmlh.get_tag_val(commitment, "period")
        # NOTE(review): this chain is placed outside the guard above because
        # l_period/l_duration are pre-initialised to ""; confirm against the
        # original indentation.
        if l_duration == "hours" and l_period == "week":
            out.append('<commitmentHoursPerWeek>' + l_num +
                       '</commitmentHoursPerWeek>')
        elif l_duration == "hours" and l_period == "day":
            # note: weekdays only. Goes through float() so fractional hours
            # are accepted, matching the month branch below.
            out.append('<commitmentHoursPerWeek>' +
                       str(int(float(l_num) * 5)) +
                       '</commitmentHoursPerWeek>')
        elif l_duration == "hours" and l_period == "month":
            # Roughly four weeks per month; never report less than one hour.
            hrs = max(1, int(float(l_num) / 4.0))
            out.append('<commitmentHoursPerWeek>' + str(hrs) +
                       '</commitmentHoursPerWeek>')
        elif l_duration == "hours" and l_period == "event":
            # TODO: ignore for now, later compute the endTime if not
            # already provided
            pass
        else:
            print(" ".join([
                str(datetime.now()),
                "parse_volunteermatch: commitment given in units != "
                "hours/week: ",
                l_duration, "per", l_period]))
        out.append('</dateTimeDuration></dateTimeDurations>')

        dbaddresses = item.getElementsByTagName("location")
        if dbaddresses.length != 1:
            print(" ".join([
                str(datetime.now()),
                "parse_volunteermatch: only 1 location supported."]))
            return None
        dbaddress = dbaddresses[0]
        out.append('<locations><location>')
        out.append('<streetAddress1>%s</streetAddress1>' %
                   xmlh.get_tag_val(dbaddress, "street1"))
        out.append('<city>%s</city>' % xmlh.get_tag_val(dbaddress, "city"))
        out.append('<region>%s</region>' %
                   xmlh.get_tag_val(dbaddress, "region"))
        out.append('<postalCode>%s</postalCode>' %
                   xmlh.get_tag_val(dbaddress, "postalCode"))
        geolocs = item.getElementsByTagName("geolocation")
        if geolocs.length == 1:
            geoloc = geolocs[0]
            out.append('<latitude>%s</latitude>' %
                       xmlh.get_tag_val(geoloc, "latitude"))
            out.append('<longitude>%s</longitude>' %
                       xmlh.get_tag_val(geoloc, "longitude"))
        out.append('</location></locations>')

        out.append('<audienceTags>')
        for audience in item.getElementsByTagName("audience"):
            out.append('<audienceTag>%s</audienceTag>' %
                       xmlh.node_data(audience))
        out.append('</audienceTags>')

        out.append('<categoryTags>')
        for category in item.getElementsByTagName("category"):
            out.append('<categoryTag>%s</categoryTag>' %
                       xmlh.node_data(category))
        out.append('</categoryTags>')

        out.append('<skills>%s</skills>' % xmlh.get_tag_val(item, "skill"))
        out.append('<detailURL>%s</detailURL>' %
                   xmlh.get_tag_val(item, "detailURL"))
        out.append('<description>%s</description>' %
                   xmlh.get_tag_val(item, "description"))

        expires = xmlh.get_tag_val(item, "expires")
        ts = dateutil.parser.parse(expires)
        expires = ts.strftime("%Y-%m-%dT%H:%M:%S")
        out.append('<expires>%s</expires>' % expires)

        out.append('</VolunteerOpportunity>')
        numopps += 1

    out.append('</VolunteerOpportunities>')
    out.append('</FootprintFeed>')
    return "".join(out), numorgs, numopps
def parse_fast(instr, maxrecs, progress):
    """Fast FPXML parser that fills in default values.

    Does not check correctness, i.e. the input must be pre-checked by the
    caller. The <FeedInfo>, <Organization> and <VolunteerOpportunity>
    fragments are located with regular expressions, parsed one at a time,
    given default values and re-serialized.

    Args:
        instr: FPXML document text.
        maxrecs: maximum number of opportunities to emit; values <= 0 mean
            "no limit". Organizations are not limited.
        progress: not used by this function.

    Returns:
        A tuple ``(fpxml_string, numorgs, numopps)`` where the counts are
        the numbers of organizations and opportunities actually emitted.
    """
    numorgs = numopps = 0
    outstr_list = ['<?xml version="1.0" ?>',
                   '<FootprintFeed schemaVersion="0.1">']

    feedinfo_re = re.compile('<FeedInfo>.+?</FeedInfo>', re.DOTALL)
    org_re = re.compile('<Organization>.+?</Organization>', re.DOTALL)
    opp_re = re.compile('<VolunteerOpportunity>.+?</VolunteerOpportunity>',
                        re.DOTALL)

    # note: processes Organizations first, so ID lookups work
    for match in feedinfo_re.finditer(instr):
        node = xmlh.simple_parser(match.group(0), KNOWN_ELEMENTS, False)
        xmlh.set_default_value(node, node.firstChild, "feedID", "0")
        set_default_time_elem(node, node.firstChild, "createdDateTime")
        outstr_list.append(xmlh.prettyxml(node, True))

    outstr_list.append('<Organizations>')
    for match in org_re.finditer(instr):
        node = xmlh.simple_parser(match.group(0), KNOWN_ELEMENTS, False)
        numorgs += 1
        outstr_list.append(xmlh.prettyxml(node, True))
    outstr_list.append('</Organizations>')

    outstr_list.append('<VolunteerOpportunities>')
    for match in opp_re.finditer(instr):
        # Test the limit before parsing, so the record past the limit is
        # neither parsed nor counted (the count used to be one too high).
        if maxrecs > 0 and numopps >= maxrecs:
            break
        opp = xmlh.simple_parser(match.group(0), KNOWN_ELEMENTS, False)
        numopps += 1

        # these set_default_* functions dont do anything if the field
        # doesnt already exists
        xmlh.set_default_value(opp, opp, "volunteersNeeded", -8888)
        xmlh.set_default_value(opp, opp, "paid", "No")
        xmlh.set_default_value(opp, opp, "sexRestrictedTo", "Neither")
        xmlh.set_default_value(opp, opp, "language", "English")
        set_default_time_elem(opp, opp, "lastUpdated")
        set_default_time_elem(opp, opp, "expires",
                              xmlh.current_ts(DEFAULT_EXPIRATION))

        # Best effort: a failed lookup is treated as "no locations".
        try:
            opplocs = opp.getElementsByTagName("location")
        except Exception:
            opplocs = []
        for loc in opplocs:
            xmlh.set_default_value(opp, loc, "virtual", "No")
            xmlh.set_default_value(opp, loc, "country", "US")

        try:
            dttms = opp.getElementsByTagName("dateTimeDurations")
        except Exception:
            dttms = []
        for dttm in dttms:
            # NOTE(review): an earlier "openEnded" default at this point was
            # marked redundant; only the trailing call below is kept.
            xmlh.set_default_value(opp, dttm, "iCalRecurrence", "")
            # getElementsByTagName returns a (possibly empty) list, never
            # None, so the old "== None" test could never choose "Yes".
            has_times = (len(dttm.getElementsByTagName("startTime")) > 0 or
                         len(dttm.getElementsByTagName("endTime")) > 0)
            # NOTE(review): a time-element helper is used for a Yes/No
            # flag here -- confirm that is intended.
            if has_times:
                set_default_time_elem(opp, dttm, "timeFlexible", "No")
            else:
                set_default_time_elem(opp, dttm, "timeFlexible", "Yes")
            xmlh.set_default_value(opp, dttm, "openEnded", "No")

        try:
            time_elems = (list(opp.getElementsByTagName("startTime")) +
                          list(opp.getElementsByTagName("endTime")))
        except Exception:
            time_elems = []
        for el in time_elems:
            xmlh.set_default_attr(opp, el, "olsonTZ", "America/Los_Angeles")

        outstr_list.append(xmlh.prettyxml(opp, True))

    outstr_list.append('</VolunteerOpportunities>')
    outstr_list.append('</FootprintFeed>')
    return "".join(outstr_list), numorgs, numopps
def parse_fast(instr, maxrecs, progress):
    """Fast FPXML parser that fills in default values.

    Does not check correctness, i.e. the input must be pre-checked by the
    caller. The <FeedInfo>, <Organization> and <VolunteerOpportunity>
    fragments are located with regular expressions, parsed one at a time,
    given default values and re-serialized.

    Args:
        instr: FPXML document text.
        maxrecs: maximum number of opportunities to emit; values <= 0 mean
            "no limit". Organizations are not limited.
        progress: not used by this function.

    Returns:
        A tuple ``(fpxml_string, numorgs, numopps)`` where the counts are
        the numbers of organizations and opportunities actually emitted.
    """
    numorgs = numopps = 0
    outstr_list = ['<?xml version="1.0" ?>',
                   '<FootprintFeed schemaVersion="0.1">']

    feedinfo_re = re.compile('<FeedInfo>.+?</FeedInfo>', re.DOTALL)
    org_re = re.compile('<Organization>.+?</Organization>', re.DOTALL)
    opp_re = re.compile('<VolunteerOpportunity>.+?</VolunteerOpportunity>',
                        re.DOTALL)

    # note: processes Organizations first, so ID lookups work
    for match in feedinfo_re.finditer(instr):
        node = xmlh.simple_parser(match.group(0), KNOWN_ELEMENTS, False)
        xmlh.set_default_value(node, node.firstChild, "feedID", "0")
        set_default_time_elem(node, node.firstChild, "createdDateTime")
        outstr_list.append(xmlh.prettyxml(node, True))

    outstr_list.append('<Organizations>')
    for match in org_re.finditer(instr):
        node = xmlh.simple_parser(match.group(0), KNOWN_ELEMENTS, False)
        numorgs += 1
        outstr_list.append(xmlh.prettyxml(node, True))
    outstr_list.append('</Organizations>')

    outstr_list.append('<VolunteerOpportunities>')
    for match in opp_re.finditer(instr):
        # Test the limit before parsing, so the record past the limit is
        # neither parsed nor counted (the count used to be one too high).
        if maxrecs > 0 and numopps >= maxrecs:
            break
        opp = xmlh.simple_parser(match.group(0), KNOWN_ELEMENTS, False)
        numopps += 1

        # these set_default_* functions dont do anything if the field
        # doesnt already exists
        xmlh.set_default_value(opp, opp, "volunteersNeeded", -8888)
        xmlh.set_default_value(opp, opp, "paid", "No")
        xmlh.set_default_value(opp, opp, "sexRestrictedTo", "Neither")
        xmlh.set_default_value(opp, opp, "language", "English")
        set_default_time_elem(opp, opp, "lastUpdated")
        set_default_time_elem(opp, opp, "expires",
                              xmlh.current_ts(DEFAULT_EXPIRATION))

        # Best effort: a failed lookup is treated as "no locations".
        try:
            opplocs = opp.getElementsByTagName("location")
        except Exception:
            opplocs = []
        for loc in opplocs:
            xmlh.set_default_value(opp, loc, "virtual", "No")
            xmlh.set_default_value(opp, loc, "country", "US")

        try:
            dttms = opp.getElementsByTagName("dateTimeDurations")
        except Exception:
            dttms = []
        for dttm in dttms:
            # NOTE(review): an earlier "openEnded" default at this point was
            # marked redundant; only the trailing call below is kept.
            xmlh.set_default_value(opp, dttm, "iCalRecurrence", "")
            # getElementsByTagName returns a (possibly empty) list, never
            # None, so the old "== None" test could never choose "Yes".
            has_times = (len(dttm.getElementsByTagName("startTime")) > 0 or
                         len(dttm.getElementsByTagName("endTime")) > 0)
            # NOTE(review): a time-element helper is used for a Yes/No
            # flag here -- confirm that is intended.
            if has_times:
                set_default_time_elem(opp, dttm, "timeFlexible", "No")
            else:
                set_default_time_elem(opp, dttm, "timeFlexible", "Yes")
            xmlh.set_default_value(opp, dttm, "openEnded", "No")

        try:
            time_elems = (list(opp.getElementsByTagName("startTime")) +
                          list(opp.getElementsByTagName("endTime")))
        except Exception:
            time_elems = []
        for el in time_elems:
            xmlh.set_default_attr(opp, el, "olsonTZ", "America/Los_Angeles")

        outstr_list.append(xmlh.prettyxml(opp, True))

    outstr_list.append('</VolunteerOpportunities>')
    outstr_list.append('</FootprintFeed>')
    return "".join(outstr_list), numorgs, numopps
def parse(s, maxrecs, progress):
    """Return FPXML given volunteermatch data.

    The feed is parsed with ``xmlh.simple_parser``. One <Organization> is
    written per listing (from the listing's single <parent> element), then
    one <VolunteerOpportunity> per listing. An organization shared by
    several listings is therefore written once per listing.

    Args:
        s: raw feed text.
        maxrecs: maximum number of listings to convert; values <= 0, or
            values larger than the number of listings, mean "all of them".
        progress: forwarded to ``xmlh.simple_parser``.

    Returns:
        A tuple ``(fpxml_string, numorgs, numopps)``, or ``None`` when a
        listing lacks exactly one <parent>, one <duration> child or one
        <location> (a timestamped message is printed first).

    Raises:
        ValueError: if a day- or month-based commitment has a
            non-numeric <num>.
    """
    # TODO: progress
    known_elnames = [
        'feed', 'title', 'subtitle', 'div', 'span', 'updated', 'id', 'link',
        'icon', 'logo', 'author', 'name', 'uri', 'email', 'rights', 'entry',
        'published', 'g:publish_date', 'g:expiration_date',
        'g:event_date_range', 'g:start', 'g:end', 'updated', 'category',
        'summary', 'content', 'awb:city', 'awb:country', 'awb:state',
        'awb:postalcode', 'g:location', 'g:age_range', 'g:employer',
        'g:job_type', 'g:job_industry', 'awb:paid',
    ]
    xmldoc = xmlh.simple_parser(s, known_elnames, progress)

    pubdate = xmlh.get_tag_val(xmldoc, "created")
    ts = dateutil.parser.parse(pubdate)
    pubdate = ts.strftime("%Y-%m-%dT%H:%M:%S")

    # convert to footprint format; fragments are joined once at the end
    out = [
        '<?xml version="1.0" ?>',
        '<FootprintFeed schemaVersion="0.1">',
        '<FeedInfo>',
        # TODO: assign provider IDs?
        '<providerID>104</providerID>',
        '<providerName>volunteermatch.org</providerName>',
        '<feedID>1</feedID>',
        '<providerURL>http://www.volunteermatch.org/</providerURL>',
        '<createdDateTime>%s</createdDateTime>' % pubdate,
        '<description></description>',
        '</FeedInfo>',
    ]

    numorgs = numopps = 0

    items = xmldoc.getElementsByTagName("listing")
    # Non-positive limits mean "no limit", as in the sibling parsers. The
    # old "== -1" test let other negative values slice from the list's end.
    if maxrecs <= 0 or maxrecs > items.length:
        maxrecs = items.length
    listings = items[0:maxrecs]

    # hardcoded: Organization
    out.append('<Organizations>')
    for item in listings:
        orgs = item.getElementsByTagName("parent")
        if orgs.length != 1:
            print(" ".join([
                str(datetime.now()),
                "parse_volunteermatch: listing does not have an organization"
            ]))
            return None
        org = orgs[0]
        out.append('<Organization>')
        out.append('<organizationID>%s</organizationID>' %
                   xmlh.get_tag_val(org, "key"))
        out.append('<nationalEIN></nationalEIN>')
        out.append('<name>%s</name>' % xmlh.get_tag_val(org, "name"))
        out.append('<missionStatement></missionStatement>')
        out.append('<description></description>')
        out.append('<location><city></city><region></region>'
                   '<postalCode></postalCode></location>')
        out.append('<organizationURL>%s</organizationURL>' %
                   xmlh.get_tag_val(org, "URL"))
        out.append('<donateURL></donateURL>')
        out.append('<logoURL></logoURL>')
        out.append('<detailURL>%s</detailURL>' %
                   xmlh.get_tag_val(org, "detailURL"))
        out.append('</Organization>')
        numorgs += 1
    out.append('</Organizations>')

    out.append('<VolunteerOpportunities>')
    for item in listings:
        out.append('<VolunteerOpportunity>')
        out.append('<volunteerOpportunityID>%s</volunteerOpportunityID>' %
                   xmlh.get_tag_val(item, "key"))

        orgs = item.getElementsByTagName("parent")
        if orgs.length == 1:
            org = orgs[0]
            out.append(
                '<sponsoringOrganizationIDs><sponsoringOrganizationID>%s'
                '</sponsoringOrganizationID></sponsoringOrganizationIDs>' %
                xmlh.get_tag_val(org, "key"))
        else:
            out.append(
                '<sponsoringOrganizationIDs><sponsoringOrganizationID>0'
                '</sponsoringOrganizationID></sponsoringOrganizationIDs>')
            print(" ".join([
                str(datetime.now()),
                "parse_volunteermatch: listing does not have an organization"
            ]))

        out.append('<title>%s</title>' % xmlh.get_tag_val(item, "title"))
        out.append('<volunteersNeeded>-8888</volunteersNeeded>')

        out.append('<dateTimeDurations><dateTimeDuration>')
        durations = xmlh.get_children_by_tagname(item, "duration")
        if len(durations) != 1:
            print(" ".join([
                str(datetime.now()),
                "parse_volunteermatch: number of durations in item != 1"]))
            return None
        duration = durations[0]
        if duration.getAttribute("ongoing") == 'true':
            out.append('<openEnded>Yes</openEnded>')
        else:
            out.append('<openEnded>No</openEnded>')
        listing_times = duration.getElementsByTagName("listingTime")
        if listing_times.length == 1:
            listing_time = listing_times[0]
            out.append('<startTime>%s</startTime>' %
                       xmlh.get_tag_val(listing_time, "startTime"))
            out.append('<endTime>%s</endTime>' %
                       xmlh.get_tag_val(listing_time, "endTime"))

        commitments = item.getElementsByTagName("commitment")
        l_num = l_period = l_duration = ""
        if commitments.length == 1:
            commitment = commitments[0]
            l_num = xmlh.get_tag_val(commitment, "num")
            l_duration = xmlh.get_tag_val(commitment, "duration")
            l_period = xmlh.get_tag_val(commitment, "period")
        # NOTE(review): this chain is placed outside the guard above because
        # l_period/l_duration are pre-initialised to ""; confirm against the
        # original indentation.
        if l_duration == "hours" and l_period == "week":
            out.append('<commitmentHoursPerWeek>' + l_num +
                       '</commitmentHoursPerWeek>')
        elif l_duration == "hours" and l_period == "day":
            # note: weekdays only. Goes through float() so fractional hours
            # are accepted, matching the month branch below.
            out.append('<commitmentHoursPerWeek>' +
                       str(int(float(l_num) * 5)) +
                       '</commitmentHoursPerWeek>')
        elif l_duration == "hours" and l_period == "month":
            # Roughly four weeks per month; never report less than one hour.
            hrs = max(1, int(float(l_num) / 4.0))
            out.append('<commitmentHoursPerWeek>' + str(hrs) +
                       '</commitmentHoursPerWeek>')
        elif l_duration == "hours" and l_period == "event":
            # TODO: ignore for now, later compute the endTime if not
            # already provided
            pass
        else:
            print(" ".join([
                str(datetime.now()),
                "parse_volunteermatch: commitment given in units != "
                "hours/week: ",
                l_duration, "per", l_period]))
        out.append('</dateTimeDuration></dateTimeDurations>')

        dbaddresses = item.getElementsByTagName("location")
        if dbaddresses.length != 1:
            print(" ".join([
                str(datetime.now()),
                "parse_volunteermatch: only 1 location supported."]))
            return None
        dbaddress = dbaddresses[0]
        out.append('<locations><location>')
        out.append('<streetAddress1>%s</streetAddress1>' %
                   xmlh.get_tag_val(dbaddress, "street1"))
        out.append('<city>%s</city>' % xmlh.get_tag_val(dbaddress, "city"))
        out.append('<region>%s</region>' %
                   xmlh.get_tag_val(dbaddress, "region"))
        out.append('<postalCode>%s</postalCode>' %
                   xmlh.get_tag_val(dbaddress, "postalCode"))
        geolocs = item.getElementsByTagName("geolocation")
        if geolocs.length == 1:
            geoloc = geolocs[0]
            out.append('<latitude>%s</latitude>' %
                       xmlh.get_tag_val(geoloc, "latitude"))
            out.append('<longitude>%s</longitude>' %
                       xmlh.get_tag_val(geoloc, "longitude"))
        out.append('</location></locations>')

        out.append('<audienceTags>')
        for audience in item.getElementsByTagName("audience"):
            out.append('<audienceTag>%s</audienceTag>' %
                       xmlh.node_data(audience))
        out.append('</audienceTags>')

        out.append('<categoryTags>')
        for category in item.getElementsByTagName("category"):
            out.append('<categoryTag>%s</categoryTag>' %
                       xmlh.node_data(category))
        out.append('</categoryTags>')

        out.append('<skills>%s</skills>' % xmlh.get_tag_val(item, "skill"))
        out.append('<detailURL>%s</detailURL>' %
                   xmlh.get_tag_val(item, "detailURL"))
        out.append('<description>%s</description>' %
                   xmlh.get_tag_val(item, "description"))

        expires = xmlh.get_tag_val(item, "expires")
        ts = dateutil.parser.parse(expires)
        expires = ts.strftime("%Y-%m-%dT%H:%M:%S")
        out.append('<expires>%s</expires>' % expires)

        out.append('</VolunteerOpportunity>')
        numopps += 1

    out.append('</VolunteerOpportunities>')
    out.append('</FootprintFeed>')
    return "".join(out), numorgs, numopps
def parse(instr, maxrecs, progress):
    """Convert a feed of <VolunteerOpportunity> records into FPXML.

    Every ``<VolunteerOpportunity>...</VolunteerOpportunity>`` span found in
    the input is parsed on its own, its organization is registered through
    ``register_org`` and the opportunity is re-emitted in footprint format.
    The feed header is built from the module-level provider settings and
    the organization section from the module-level ``ORGS`` mapping.

    Args:
        instr: raw feed text.
        maxrecs: maximum number of opportunities to convert; values <= 0
            mean "no limit".
        progress: forwarded to ``xmlh.print_rps_progress``.

    Returns:
        A tuple ``(fpxml_string, numorgs, numopps)``.
    """
    numorgs = numopps = 0
    # Rewrite "db:" namespace prefixes (<db:x> -> <db_x>) before parsing.
    instr = re.sub(r'<(/?db):', r'<\1_', instr)
    opps = re.findall(r'<VolunteerOpportunity>.+?</VolunteerOpportunity>',
                      instr, re.DOTALL)

    # Collect fragments in a list and join once instead of repeated "+=".
    opp_parts = []
    for i, oppstr in enumerate(opps):
        # i is zero-based, so stop once maxrecs records have been emitted
        # (the previous "i > maxrecs" test let one extra record through).
        if maxrecs > 0 and i >= maxrecs:
            break
        xmlh.print_rps_progress("opps", progress, i, maxrecs)

        item = xmlh.simple_parser(oppstr, known_elnames, progress=False)
        orgid = register_org(item)

        # logoURL -- sigh, this is for the opportunity not the org
        opp_parts.append('<VolunteerOpportunity>')
        opp_parts.append(xmlh.output_val('volunteerOpportunityID', str(i)))
        opp_parts.append(
            xmlh.output_val('sponsoringOrganizationID', str(orgid)))
        opp_parts.append(
            xmlh.output_node('volunteerHubOrganizationID', item, "LocalID"))
        opp_parts.append(xmlh.output_node('title', item, "Title"))
        # Both abstract and description are fed from the same source tag.
        opp_parts.append(xmlh.output_node('abstract', item, "Description"))
        opp_parts.append(xmlh.output_node('description', item, "Description"))
        opp_parts.append(xmlh.output_node('detailURL', item, "DetailURL"))
        opp_parts.append(xmlh.output_val('volunteersNeeded', "-8888"))

        # Best effort: a failed lookup is treated as "no dates".
        try:
            oppdates = item.getElementsByTagName("OpportunityDate")
        except Exception:
            oppdates = []
        if len(oppdates) > 1:
            print(" ".join([
                str(datetime.now()),
                "parse_servenet.py: only 1 OpportunityDate supported."]))
        # Only the first date is used, even when several are present.
        oppdate = oppdates[0] if len(oppdates) > 0 else None

        opp_parts.append('<dateTimeDurations><dateTimeDuration>')
        if oppdate:
            opp_parts.append(xmlh.output_val('openEnded', 'No'))
            opp_parts.append(xmlh.output_val(
                'duration',
                'P%s%s' % (xmlh.get_tag_val(oppdate, "DurationQuantity"),
                           xmlh.get_tag_val(oppdate, "DurationUnit"))))
            opp_parts.append(xmlh.output_val('commitmentHoursPerWeek', '0'))
            opp_parts.append(
                xmlh.output_node('startDate', oppdate, "StartDate"))
            opp_parts.append(xmlh.output_node('endDate', oppdate, "EndDate"))
        else:
            opp_parts.append(xmlh.output_val('openEnded', 'Yes'))
            opp_parts.append(xmlh.output_val('commitmentHoursPerWeek', '0'))
        opp_parts.append('</dateTimeDuration></dateTimeDurations>')

        opp_parts.append('<locations>')
        # Best effort: a failed lookup is treated as "no locations".
        try:
            opplocs = item.getElementsByTagName("Location")
        except Exception:
            opplocs = []
        for opploc in opplocs:
            opp_parts.append('<location>')
            virtual_tag = opploc.getElementsByTagName("Virtual")
            if (virtual_tag and
                    xmlh.get_tag_val(opploc, "Virtual").lower() == "yes"):
                opp_parts.append(xmlh.output_val('virtual', 'Yes'))
            else:
                opp_parts.append(
                    xmlh.output_node('region', opploc, "StateOrProvince"))
                opp_parts.append(
                    xmlh.output_node('country', opploc, "Country"))
                opp_parts.append(
                    xmlh.output_node('postalCode', opploc, "ZipOrPostalCode"))
            opp_parts.append('</location>')
        opp_parts.append('</locations>')
        opp_parts.append('<categoryTags/>')
        opp_parts.append('</VolunteerOpportunity>')
        numopps += 1

    # convert to footprint format
    feed_parts = [
        '<?xml version="1.0" ?>',
        '<FootprintFeed schemaVersion="0.1">',
        '<FeedInfo>',
        xmlh.output_val('providerID', providerID),
        xmlh.output_val('providerName', providerName),
        xmlh.output_val('feedID', feedID),
        xmlh.output_val('createdDateTime', xmlh.current_ts()),
        xmlh.output_val('providerURL', providerURL),
        xmlh.output_val('description', feedDescription),
        # TODO: capture ts -- use now?!
        '</FeedInfo>',
        # hardcoded: Organization
        '<Organizations>',
    ]
    # ORGS is read only after the loop above has called register_org for
    # every opportunity.
    for key in ORGS:
        feed_parts.append(ORGS[key])
        numorgs += 1
    feed_parts.append('</Organizations>')
    feed_parts.append('<VolunteerOpportunities>')
    feed_parts.extend(opp_parts)
    feed_parts.append('</VolunteerOpportunities>')
    feed_parts.append('</FootprintFeed>')
    return "".join(feed_parts), numorgs, numopps
'sensor' : 'false', 'clientID' : CLIENT_ID}) try: maps_fh = urllib2.urlopen("http://maps.google.com/maps/geo?%s" % params) res = maps_fh.read() maps_fh.close() except IOError, err: print_debug("geocode_call: Error contacting Maps API. Sleeping. " + str(err)) time.sleep(1) return geocode_call(query, retries - 1) #print_debug("response length: "+str(len(res))) if re.search(r'403 Forbidden', res): respcode = 403 else: node = xmlh.simple_parser(res, [], False) respcode = xmlh.get_tag_val(node, "code") if respcode == "": #print_debug("unparseable response: "+res) return False respcode = int(respcode) if respcode in (400, 601, 602, 603): # problem with the query return None if respcode in (403, 500, 620): # problem with the server print_debug("geocode_call: Connection problem or quota exceeded. Sleeping...") if retries == 4: xmlh.print_progress("geocoder: %d" % respcode, "", SHOW_PROGRESS) time.sleep(5) return geocode_call(query, retries - 1)
def parse(instr, maxrecs, progress):
    """Return FPXML given usaservice data.

    Each line of the input is parsed as one item and converted into a
    <VolunteerOpportunity>. All opportunities are attached to a single
    hardcoded placeholder organization with ID 0.

    Args:
        instr: raw feed text, one item per line.
        maxrecs: maximum number of items to convert; values <= 0 mean
            "no limit".
        progress: forwarded to ``xmlh.print_rps_progress``.

    Returns:
        A tuple ``(fpxml_string, numorgs, numopps)``, or ``None`` when an
        item does not have exactly one db_scheduledTime or exactly one
        db_address (a timestamped message is printed first).

    Raises:
        ValueError: if an item's db_dateTime value does not consist of
            exactly two space-separated parts.
    """
    # TODO: progress
    known_elnames = [
        'channel', 'db:abstract', 'db:address', 'db:attendee_count',
        'db:categories', 'db:city', 'db:country', 'db:county', 'db:dateTime',
        'db:event', 'db:eventType', 'db:guest_total', 'db:host',
        'db:latitude', 'db:length', 'db:longitude', 'db:rsvp',
        'db:scheduledTime', 'db:state', 'db:street', 'db:title',
        'db:venue_name', 'db:zipcode', 'description', 'docs', 'guid', 'item',
        'language', 'link', 'pubDate', 'rss', 'title',
    ]

    # convert to footprint format; fragments are joined once at the end
    out = [
        '<?xml version="1.0" ?>',
        '<FootprintFeed schemaVersion="0.1">',
        '<FeedInfo>',
        # TODO: assign provider IDs?
        '<providerID>101</providerID>',
        '<providerName>usaservice.org</providerName>',
        '<feedID>1</feedID>',
        '<createdDateTime>%s</createdDateTime>' % xmlh.current_ts(),
        '<providerURL>http://www.usaservice.org/</providerURL>',
        '<description>Syndicated events</description>',
        # TODO: capture ts -- use now?!
        '</FeedInfo>',
    ]

    numorgs = numopps = 0

    # hardcoded: Organization
    out.append('<Organizations>')
    out.append('<Organization>')
    out.append('<organizationID>0</organizationID>')
    out.append('<nationalEIN></nationalEIN>')
    out.append('<name></name>')
    out.append('<missionStatement></missionStatement>')
    out.append('<description></description>')
    out.append('<location><city></city><region></region>'
               '<postalCode></postalCode></location>')
    out.append('<organizationURL></organizationURL>')
    out.append('<donateURL></donateURL>')
    out.append('<logoURL></logoURL>')
    out.append('<detailURL></detailURL>')
    out.append('</Organization>')
    numorgs += 1
    out.append('</Organizations>')

    out.append('<VolunteerOpportunities>')

    # Rewrite "db:" namespace prefixes (<db:x> -> <db_x>) before parsing.
    instr = re.sub(r'<(/?db):', r'<\1_', instr)
    for i, line in enumerate(instr.splitlines()):
        # i is zero-based, so stop once maxrecs items have been emitted
        # (the previous "i > maxrecs" test let one extra item through).
        if maxrecs > 0 and i >= maxrecs:
            break
        xmlh.print_rps_progress("opps", progress, i, maxrecs)
        item = xmlh.simple_parser(line, known_elnames, progress=False)

        # unmapped: db_rsvp (seems to be same as link, but with #rsvp at
        #   end of url?)
        # unmapped: db_host (no equivalent?)
        # unmapped: db_county (seems to be empty)
        # unmapped: attendee_count
        # unmapped: guest_total
        # unmapped: db_title (dup of title, above)
        out.append('<VolunteerOpportunity>')
        out.append('<volunteerOpportunityID>%s</volunteerOpportunityID>' %
                   xmlh.get_tag_val(item, "guid"))
        # hardcoded: sponsoringOrganizationID
        out.append('<sponsoringOrganizationIDs><sponsoringOrganizationID>0'
                   '</sponsoringOrganizationID></sponsoringOrganizationIDs>')
        # hardcoded: volunteerHubOrganizationID
        out.append(
            '<volunteerHubOrganizationIDs><volunteerHubOrganizationID>0'
            '</volunteerHubOrganizationID></volunteerHubOrganizationIDs>')
        out.append('<title>%s</title>' % xmlh.get_tag_val(item, "title"))
        # NOTE(review): the element list names db:abstract (db_abstract
        # after the rewrite above); confirm that "abstract" finds it.
        out.append('<abstract>%s</abstract>' %
                   xmlh.get_tag_val(item, "abstract"))
        out.append('<volunteersNeeded>-8888</volunteersNeeded>')

        sched_times = item.getElementsByTagName("db_scheduledTime")
        if sched_times.length != 1:
            print(" ".join([
                str(datetime.now()),
                "parse_usaservice: only 1 db_scheduledTime supported."]))
            return None
        sched_time = sched_times[0]
        out.append('<dateTimeDurations><dateTimeDuration>')
        length = xmlh.get_tag_val(sched_time, "db_length")
        # An empty or "-1" length marks the event as open ended.
        if length == "" or length == "-1":
            out.append('<openEnded>Yes</openEnded>')
        else:
            out.append('<openEnded>No</openEnded>')
        # Locals renamed so they no longer shadow the "time" module name.
        start_date, start_time = xmlh.get_tag_val(
            sched_time, "db_dateTime").split(" ")
        out.append('<startDate>%s</startDate>' % start_date)
        # TODO: timezone???
        out.append('<startTime>%s</startTime>' % start_time)
        out.append('</dateTimeDuration></dateTimeDurations>')

        dbaddresses = item.getElementsByTagName("db_address")
        if dbaddresses.length != 1:
            print(" ".join([
                str(datetime.now()),
                "parse_usaservice: only 1 db_address supported."]))
            return None
        dbaddress = dbaddresses[0]
        out.append('<locations><location>')
        out.append('<name>%s</name>' %
                   xmlh.get_tag_val(item, "db_venue_name"))
        out.append('<streetAddress1>%s</streetAddress1>' %
                   xmlh.get_tag_val(dbaddress, "db_street"))
        out.append('<city>%s</city>' %
                   xmlh.get_tag_val(dbaddress, "db_city"))
        out.append('<region>%s</region>' %
                   xmlh.get_tag_val(dbaddress, "db_state"))
        out.append('<country>%s</country>' %
                   xmlh.get_tag_val(dbaddress, "db_country"))
        out.append('<postalCode>%s</postalCode>' %
                   xmlh.get_tag_val(dbaddress, "db_zipcode"))
        out.append('<latitude>%s</latitude>' %
                   xmlh.get_tag_val(item, "db_latitude"))
        out.append('<longitude>%s</longitude>' %
                   xmlh.get_tag_val(item, "db_longitude"))
        out.append('</location></locations>')

        # Local renamed so it no longer shadows the builtin "type".
        event_type = xmlh.get_tag_val(item, "db_eventType")
        out.append(
            '<categoryTags><categoryTag>%s</categoryTag></categoryTags>' %
            event_type)

        out.append('<contactName>%s</contactName>' %
                   xmlh.get_tag_val(item, "db_host"))
        out.append('<detailURL>%s</detailURL>' %
                   xmlh.get_tag_val(item, "link"))
        out.append('<description>%s</description>' %
                   xmlh.get_tag_val(item, "description"))

        pubdate = xmlh.get_tag_val(item, "pubDate")
        # Reformat only when the value contains a "DD Mon YYYY" date;
        # anything else is passed through unchanged.
        if re.search("[0-9][0-9] [A-Z][a-z][a-z] [0-9][0-9][0-9][0-9]",
                     pubdate):
            # TODO: parse() is ignoring timzone...
            ts = dateutil.parser.parse(pubdate)
            pubdate = ts.strftime("%Y-%m-%dT%H:%M:%S")
        out.append('<lastUpdated>%s</lastUpdated>' % pubdate)
        out.append('</VolunteerOpportunity>')
        numopps += 1

    out.append('</VolunteerOpportunities>')
    out.append('</FootprintFeed>')
    return "".join(out), numorgs, numopps