Python simple_parserの例、xml_helpers.simple_parser Pythonの例

コード例 #1

0

ファイルを表示

ファイル: parse_footprint.py プロジェクト: Alwnikrotikz/allforgood

def parse(instr, maxrecs, progress):
  """return python DOM object given FPXML"""
  # parsing footprint format is the identity operation
  if progress:
    print datetime.now(), "parse_footprint: parsing ", len(instr), " bytes."
  xmldoc = xmlh.simple_parser(instr, KNOWN_ELEMENTS, progress)
  if progress:
    print datetime.now(), "parse_footprint: done parsing."
  return xmldoc

コード例 #2

0

ファイルを表示

ファイル: parse_footprint.py プロジェクト: thegooglecodearchive/allforgood

def parse(instr, maxrecs, progress):
    """return python DOM object given FPXML"""
    # parsing footprint format is the identity operation
    if progress:
        print datetime.now(), "parse_footprint: parsing ", len(
            instr), " bytes."
    xmldoc = xmlh.simple_parser(instr, KNOWN_ELEMENTS, progress)
    if progress:
        print datetime.now(), "parse_footprint: done parsing."
    return xmldoc

コード例 #3

0

ファイルを表示

    def parse(instr, maxrecs, progress):
        numorgs = numopps = 0
        instr = re.sub(r'<(/?db):', r'<\1_', instr)
        opps = re.findall(r'<VolunteerOpportunity>.+?</VolunteerOpportunity>',
                          instr, re.DOTALL)
        volopps = ""
        for i, oppstr in enumerate(opps):
            #if progress and i > 0 and i % 250 == 0:
            #  print str(datetime.now())+": ", i, " opportunities processed."
            if (maxrecs > 0 and i > maxrecs):
                break
            xmlh.print_rps_progress("opps", progress, i, maxrecs)

            item = xmlh.simple_parser(oppstr, known_elnames, progress=False)

            orgid = register_org(item)

            # logoURL -- sigh, this is for the opportunity not the org
            volopps += '<VolunteerOpportunity>'
            volopps += xmlh.output_val('volunteerOpportunityID', str(i))
            volopps += xmlh.output_val('sponsoringOrganizationID', str(orgid))
            volopps += xmlh.output_node('volunteerHubOrganizationID', item,
                                        "LocalID")
            volopps += xmlh.output_node('title', item, "Title")
            volopps += xmlh.output_node('abstract', item, "Description")
            volopps += xmlh.output_node('description', item, "Description")
            volopps += xmlh.output_node('detailURL', item, "DetailURL")
            volopps += xmlh.output_val('volunteersNeeded', "-8888")

            try:
                oppdates = item.getElementsByTagName("OpportunityDate")
            except:
                oppdates = []

            if len(oppdates) > 1:
                print datetime.now(), \
                    "parse_servenet.py: only 1 OpportunityDate supported."
                #return None
                oppdate = oppdates[0]
            elif len(oppdates) == 0:
                oppdate = None
            else:
                oppdate = oppdates[0]
            volopps += '<dateTimeDurations><dateTimeDuration>'

            if oppdate:
                volopps += xmlh.output_val('openEnded', 'No')
                volopps += xmlh.output_val(
                    'duration',
                    'P%s%s' % (xmlh.get_tag_val(oppdate, "DurationQuantity"),
                               xmlh.get_tag_val(oppdate, "DurationUnit")))
                volopps += xmlh.output_val('commitmentHoursPerWeek', '0')
                volopps += xmlh.output_node('startDate', oppdate, "StartDate")
                volopps += xmlh.output_node('endDate', oppdate, "EndDate")
            else:
                volopps += xmlh.output_val('openEnded', 'Yes')
                volopps += xmlh.output_val('commitmentHoursPerWeek', '0')
            volopps += '</dateTimeDuration></dateTimeDurations>'

            volopps += '<locations>'
            try:
                opplocs = item.getElementsByTagName("Location")
            except:
                opplocs = []
            for opploc in opplocs:
                volopps += '<location>'
                virtual_tag = opploc.getElementsByTagName("Virtual")
                if virtual_tag and xmlh.get_tag_val(
                        opploc, "Virtual").lower() == "yes":
                    volopps += xmlh.output_val('virtual', 'Yes')
                else:
                    volopps += xmlh.output_node('region', opploc,
                                                "StateOrProvince")
                    volopps += xmlh.output_node('country', opploc, "Country")
                    volopps += xmlh.output_node('postalCode', opploc,
                                                "ZipOrPostalCode")
                volopps += '</location>'
            volopps += '</locations>'
            volopps += '<categoryTags/>'
            volopps += '</VolunteerOpportunity>'
            numopps += 1

        # convert to footprint format
        outstr = '<?xml version="1.0" ?>'
        outstr += '<FootprintFeed schemaVersion="0.1">'
        outstr += '<FeedInfo>'
        outstr += xmlh.output_val('providerID', providerID)
        outstr += xmlh.output_val('providerName', providerName)
        outstr += xmlh.output_val('feedID', feedID)
        outstr += xmlh.output_val('createdDateTime', xmlh.current_ts())
        outstr += xmlh.output_val('providerURL', providerURL)
        outstr += xmlh.output_val('description', feedDescription)
        # TODO: capture ts -- use now?!
        outstr += '</FeedInfo>'

        # hardcoded: Organization
        outstr += '<Organizations>'
        for key in ORGS:
            outstr += ORGS[key]
            numorgs += 1
        outstr += '</Organizations>'
        outstr += '<VolunteerOpportunities>'
        outstr += volopps
        outstr += '</VolunteerOpportunities>'
        outstr += '</FootprintFeed>'

        #outstr = re.sub(r'><([^/])', r'>\n<\1', outstr)
        return outstr, numorgs, numopps

コード例 #4

0

ファイルを表示

ファイル: parse_volunteermatch.py プロジェクト: Alwnikrotikz/allforgood

def parse(s, maxrecs, progress):
  """return FPXML given volunteermatch data"""
  # TODO: progress
  known_elnames = ['feed', 'title', 'subtitle', 'div', 'span', 'updated', 'id', 'link', 'icon', 'logo', 'author', 'name', 'uri', 'email', 'rights', 'entry', 'published', 'g:publish_date', 'g:expiration_date', 'g:event_date_range', 'g:start', 'g:end', 'updated', 'category', 'summary', 'content', 'awb:city', 'awb:country', 'awb:state', 'awb:postalcode', 'g:location', 'g:age_range', 'g:employer', 'g:job_type', 'g:job_industry', 'awb:paid', ]
  xmldoc = xmlh.simple_parser(s, known_elnames, progress)

  pubdate = xmlh.get_tag_val(xmldoc, "created")
  ts = dateutil.parser.parse(pubdate)
  pubdate = ts.strftime("%Y-%m-%dT%H:%M:%S")

  # convert to footprint format
  s = '<?xml version="1.0" ?>'
  s += '<FootprintFeed schemaVersion="0.1">'
  s += '<FeedInfo>'
  # TODO: assign provider IDs?
  s += '<providerID>104</providerID>'
  s += '<providerName>volunteermatch.org</providerName>'
  s += '<feedID>1</feedID>'
  s += '<providerURL>http://www.volunteermatch.org/</providerURL>'
  s += '<createdDateTime>%s</createdDateTime>' % (pubdate)
  s += '<description></description>' 
  s += '</FeedInfo>'

  numorgs = numopps = 0

  # hardcoded: Organization
  s += '<Organizations>'
  items = xmldoc.getElementsByTagName("listing")
  if (maxrecs > items.length or maxrecs == -1):
    maxrecs = items.length
    
  for item in items[0:maxrecs]:
    orgs = item.getElementsByTagName("parent")
    if (orgs.length == 1):
      org = orgs[0]
      s += '<Organization>'
      s += '<organizationID>%s</organizationID>' % (xmlh.get_tag_val(org, "key"))
      s += '<nationalEIN></nationalEIN>'
      s += '<name>%s</name>' % (xmlh.get_tag_val(org, "name"))
      s += '<missionStatement></missionStatement>'
      s += '<description></description>'
      s += '<location><city></city><region></region><postalCode></postalCode></location>'
      s += '<organizationURL>%s</organizationURL>' % (xmlh.get_tag_val(org, "URL"))
      s += '<donateURL></donateURL>'
      s += '<logoURL></logoURL>'
      s += '<detailURL>%s</detailURL>' % (xmlh.get_tag_val(org, "detailURL"))
      s += '</Organization>'
      numorgs += 1
    else:
      print datetime.now(), "parse_volunteermatch: listing does not have an organization"
      return None

  s += '</Organizations>'
    
  s += '<VolunteerOpportunities>'
  items = xmldoc.getElementsByTagName("listing")
  for item in items[0:maxrecs]:
    s += '<VolunteerOpportunity>'
    s += '<volunteerOpportunityID>%s</volunteerOpportunityID>' % (xmlh.get_tag_val(item, "key"))

    orgs = item.getElementsByTagName("parent")
    if (orgs.length == 1):
      org = orgs[0]
      s += '<sponsoringOrganizationIDs><sponsoringOrganizationID>%s</sponsoringOrganizationID></sponsoringOrganizationIDs>' % (xmlh.get_tag_val(org, "key"))
    else:
      s += '<sponsoringOrganizationIDs><sponsoringOrganizationID>0</sponsoringOrganizationID></sponsoringOrganizationIDs>'
      print datetime.now(), "parse_volunteermatch: listing does not have an organization"
      
    s += '<title>%s</title>' % (xmlh.get_tag_val(item, "title"))

    s += '<volunteersNeeded>-8888</volunteersNeeded>'

    s += '<dateTimeDurations><dateTimeDuration>'
    durations = xmlh.get_children_by_tagname(item, "duration")
    if (len(durations) == 1):
      duration = durations[0]
      ongoing = duration.getAttribute("ongoing")
      if (ongoing == 'true'):
        s += '<openEnded>Yes</openEnded>'
      else:
        s += '<openEnded>No</openEnded>'
          
      listingTimes = duration.getElementsByTagName("listingTime")
      if (listingTimes.length == 1):
        listingTime = listingTimes[0]
        s += '<startTime>%s</startTime>' % (xmlh.get_tag_val(listingTime, "startTime"))
        s += '<endTime>%s</endTime>' % (xmlh.get_tag_val(listingTime, "endTime"))
    else:
      print datetime.now(), "parse_volunteermatch: number of durations in item != 1"
      return None
        
    commitments = item.getElementsByTagName("commitment")
    l_period = l_duration = ""
    if (commitments.length == 1):
      commitment = commitments[0]
      l_num = xmlh.get_tag_val(commitment, "num")
      l_duration = xmlh.get_tag_val(commitment, "duration")
      l_period = xmlh.get_tag_val(commitment, "period")
      if ((l_duration == "hours") and (l_period == "week")):
        s += '<commitmentHoursPerWeek>' + l_num + '</commitmentHoursPerWeek>'
      elif ((l_duration == "hours") and (l_period == "day")):
        # note: weekdays only
        s += '<commitmentHoursPerWeek>' + str(int(l_num)*5) + '</commitmentHoursPerWeek>'
      elif ((l_duration == "hours") and (l_period == "month")):
        hrs = int(float(l_num)/4.0)
        if hrs < 1: hrs = 1
        s += '<commitmentHoursPerWeek>' + str(hrs) + '</commitmentHoursPerWeek>'
      elif ((l_duration == "hours") and (l_period == "event")):
        # TODO: ignore for now, later compute the endTime if not already provided
        pass
      else:
        print datetime.now(), "parse_volunteermatch: commitment given in units != hours/week: ", l_duration, "per", l_period
        
    s += '</dateTimeDuration></dateTimeDurations>'

    dbaddresses = item.getElementsByTagName("location")
    if (dbaddresses.length != 1):
      print datetime.now(), "parse_volunteermatch: only 1 location supported."
      return None
    dbaddress = dbaddresses[0]
    s += '<locations><location>'
    s += '<streetAddress1>%s</streetAddress1>' % (xmlh.get_tag_val(dbaddress, "street1"))
    s += '<city>%s</city>' % (xmlh.get_tag_val(dbaddress, "city"))
    s += '<region>%s</region>' % (xmlh.get_tag_val(dbaddress, "region"))
    s += '<postalCode>%s</postalCode>' % (xmlh.get_tag_val(dbaddress, "postalCode"))
    
    geolocs = item.getElementsByTagName("geolocation")
    if (geolocs.length == 1):
      geoloc = geolocs[0]
      s += '<latitude>%s</latitude>' % (xmlh.get_tag_val(geoloc, "latitude"))
      s += '<longitude>%s</longitude>' % (xmlh.get_tag_val(geoloc, "longitude"))
    
    s += '</location></locations>'
    
    s += '<audienceTags>'
    audiences = item.getElementsByTagName("audience")
    for audience in audiences:
      type = xmlh.node_data(audience)
      s += '<audienceTag>%s</audienceTag>' % (type)
    s += '</audienceTags>'

    s += '<categoryTags>'
    categories = item.getElementsByTagName("category")
    for category in categories:
      type = xmlh.node_data(category)
      s += '<categoryTag>%s</categoryTag>' % (type)
    s += '</categoryTags>'

    s += '<skills>%s</skills>' % (xmlh.get_tag_val(item, "skill"))

    s += '<detailURL>%s</detailURL>' % (xmlh.get_tag_val(item, "detailURL"))
    s += '<description>%s</description>' % (xmlh.get_tag_val(item, "description"))

    expires = xmlh.get_tag_val(item, "expires")
    ts = dateutil.parser.parse(expires)
    expires = ts.strftime("%Y-%m-%dT%H:%M:%S")
    s += '<expires>%s</expires>' % (expires)

    s += '</VolunteerOpportunity>'
    numopps += 1
    
  s += '</VolunteerOpportunities>'
  s += '</FootprintFeed>'

  #s = re.sub(r'><([^/])', r'>\n<\1', s)
  #print(s)
  return s, numorgs, numopps

コード例 #5

0

ファイルを表示

ファイル: parse_footprint.py プロジェクト: Alwnikrotikz/allforgood

def parse_fast(instr, maxrecs, progress):
  """Normalize FPXML quickly, without checking its correctness.

  The input must be pre-checked by the caller.  Each <FeedInfo>,
  <Organization> and <VolunteerOpportunity> element is cut out of instr
  with a regex, parsed on its own, given default values and re-serialized
  with xmlh.prettyxml.

  Args:
    instr: FPXML document text.
    maxrecs: when greater than zero, at most this many opportunities are
      emitted; otherwise all of them are.
    progress: not used by this function.

  Returns:
    A (fpxml_string, numorgs, numopps) tuple.
  """
  numorgs = numopps = 0
  outstr_list = ['<?xml version="1.0" ?>']
  outstr_list.append('<FootprintFeed schemaVersion="0.1">')

  # note: processes Organizations first, so ID lookups work
  for match in re.finditer(r'<FeedInfo>.+?</FeedInfo>', instr, re.DOTALL):
    node = xmlh.simple_parser(match.group(0), KNOWN_ELEMENTS, False)
    xmlh.set_default_value(node, node.firstChild, "feedID", "0")
    set_default_time_elem(node, node.firstChild, "createdDateTime")
    outstr_list.append(xmlh.prettyxml(node, True))

  outstr_list.append('<Organizations>')
  for match in re.finditer(r'<Organization>.+?</Organization>', instr,
                           re.DOTALL):
    node = xmlh.simple_parser(match.group(0), KNOWN_ELEMENTS, False)
    numorgs += 1
    outstr_list.append(xmlh.prettyxml(node, True))
  outstr_list.append('</Organizations>')

  outstr_list.append('<VolunteerOpportunities>')
  for match in re.finditer(r'<VolunteerOpportunity>.+?</VolunteerOpportunity>',
                           instr, re.DOTALL):
    # Test the cap before parsing and counting: previously numopps was
    # bumped first, so the returned count was maxrecs + 1 and one record
    # was parsed only to be thrown away.
    if (maxrecs > 0 and numopps >= maxrecs):
      break
    opp = xmlh.simple_parser(match.group(0), KNOWN_ELEMENTS, False)
    numopps += 1

    # these set_default_* functions dont do anything if the field
    # doesnt already exists
    xmlh.set_default_value(opp, opp, "volunteersNeeded", -8888)
    xmlh.set_default_value(opp, opp, "paid", "No")
    xmlh.set_default_value(opp, opp, "sexRestrictedTo", "Neither")
    xmlh.set_default_value(opp, opp, "language", "English")
    set_default_time_elem(opp, opp, "lastUpdated")
    set_default_time_elem(opp, opp, "expires",
                          xmlh.current_ts(DEFAULT_EXPIRATION))

    # Best effort: a failed lookup is treated as "no locations".
    try:
      opplocs = opp.getElementsByTagName("location")
    except Exception:
      opplocs = []

    for loc in opplocs:
      xmlh.set_default_value(opp, loc, "virtual", "No")
      xmlh.set_default_value(opp, loc, "country", "US")

    try:
      dttms = opp.getElementsByTagName("dateTimeDurations")
    except Exception:
      dttms = []

    for dttm in dttms:
      xmlh.set_default_value(opp, dttm, "iCalRecurrence", "")
      # getElementsByTagName returns a (possibly empty) node list, never
      # None, so the old "== None" test could not succeed and the "Yes"
      # branch was dead.  Emptiness is what needs testing.
      has_start = dttm.getElementsByTagName("startTime")
      has_end = dttm.getElementsByTagName("endTime")
      if not has_start and not has_end:
        set_default_time_elem(opp, dttm, "timeFlexible", "Yes")
      else:
        set_default_time_elem(opp, dttm, "timeFlexible", "No")
      xmlh.set_default_value(opp, dttm, "openEnded", "No")

    try:
      time_elems = opp.getElementsByTagName("startTime")
      time_elems += opp.getElementsByTagName("endTime")
    except Exception:
      time_elems = []

    for el in time_elems:
      xmlh.set_default_attr(opp, el, "olsonTZ", "America/Los_Angeles")

    outstr_list.append(xmlh.prettyxml(opp, True))

  outstr_list.append('</VolunteerOpportunities>')

  outstr_list.append('</FootprintFeed>')
  return "".join(outstr_list), numorgs, numopps

コード例 #6

0

ファイルを表示

ファイル: parse_footprint.py プロジェクト: thegooglecodearchive/allforgood

def parse_fast(instr, maxrecs, progress):
    """Normalize FPXML quickly, without checking its correctness.

    The input must be pre-checked by the caller.  Each <FeedInfo>,
    <Organization> and <VolunteerOpportunity> element is cut out of instr
    with a regex, parsed on its own, given default values and re-serialized
    with xmlh.prettyxml.

    Args:
        instr: FPXML document text.
        maxrecs: when greater than zero, at most this many opportunities
            are emitted; otherwise all of them are.
        progress: not used by this function.

    Returns:
        A (fpxml_string, numorgs, numopps) tuple.
    """
    numorgs = numopps = 0
    outstr_list = ['<?xml version="1.0" ?>']
    outstr_list.append('<FootprintFeed schemaVersion="0.1">')

    # note: processes Organizations first, so ID lookups work
    for match in re.finditer(r'<FeedInfo>.+?</FeedInfo>', instr, re.DOTALL):
        node = xmlh.simple_parser(match.group(0), KNOWN_ELEMENTS, False)
        xmlh.set_default_value(node, node.firstChild, "feedID", "0")
        set_default_time_elem(node, node.firstChild, "createdDateTime")
        outstr_list.append(xmlh.prettyxml(node, True))

    outstr_list.append('<Organizations>')
    for match in re.finditer(r'<Organization>.+?</Organization>', instr,
                             re.DOTALL):
        node = xmlh.simple_parser(match.group(0), KNOWN_ELEMENTS, False)
        numorgs += 1
        outstr_list.append(xmlh.prettyxml(node, True))
    outstr_list.append('</Organizations>')

    outstr_list.append('<VolunteerOpportunities>')
    for match in re.finditer(
            r'<VolunteerOpportunity>.+?</VolunteerOpportunity>', instr,
            re.DOTALL):
        # Test the cap before parsing and counting: previously numopps was
        # bumped first, so the returned count was maxrecs + 1 and one
        # record was parsed only to be thrown away.
        if (maxrecs > 0 and numopps >= maxrecs):
            break
        opp = xmlh.simple_parser(match.group(0), KNOWN_ELEMENTS, False)
        numopps += 1

        # these set_default_* functions dont do anything if the field
        # doesnt already exists
        xmlh.set_default_value(opp, opp, "volunteersNeeded", -8888)
        xmlh.set_default_value(opp, opp, "paid", "No")
        xmlh.set_default_value(opp, opp, "sexRestrictedTo", "Neither")
        xmlh.set_default_value(opp, opp, "language", "English")
        set_default_time_elem(opp, opp, "lastUpdated")
        set_default_time_elem(opp, opp, "expires",
                              xmlh.current_ts(DEFAULT_EXPIRATION))

        # Best effort: a failed lookup is treated as "no locations".
        try:
            opplocs = opp.getElementsByTagName("location")
        except Exception:
            opplocs = []

        for loc in opplocs:
            xmlh.set_default_value(opp, loc, "virtual", "No")
            xmlh.set_default_value(opp, loc, "country", "US")

        try:
            dttms = opp.getElementsByTagName("dateTimeDurations")
        except Exception:
            dttms = []

        for dttm in dttms:
            xmlh.set_default_value(opp, dttm, "iCalRecurrence", "")
            # getElementsByTagName returns a (possibly empty) node list,
            # never None, so the old "== None" test could not succeed and
            # the "Yes" branch was dead.  Emptiness is what needs testing.
            has_start = dttm.getElementsByTagName("startTime")
            has_end = dttm.getElementsByTagName("endTime")
            if not has_start and not has_end:
                set_default_time_elem(opp, dttm, "timeFlexible", "Yes")
            else:
                set_default_time_elem(opp, dttm, "timeFlexible", "No")
            xmlh.set_default_value(opp, dttm, "openEnded", "No")

        try:
            time_elems = opp.getElementsByTagName("startTime")
            time_elems += opp.getElementsByTagName("endTime")
        except Exception:
            time_elems = []

        for el in time_elems:
            xmlh.set_default_attr(opp, el, "olsonTZ", "America/Los_Angeles")

        outstr_list.append(xmlh.prettyxml(opp, True))

    outstr_list.append('</VolunteerOpportunities>')

    outstr_list.append('</FootprintFeed>')
    return "".join(outstr_list), numorgs, numopps

コード例 #7

0

ファイルを表示

ファイル: parse_volunteermatch.py プロジェクト: thegooglecodearchive/allforgood

def parse(s, maxrecs, progress):
    """return FPXML given volunteermatch data"""
    # TODO: progress
    known_elnames = [
        'feed',
        'title',
        'subtitle',
        'div',
        'span',
        'updated',
        'id',
        'link',
        'icon',
        'logo',
        'author',
        'name',
        'uri',
        'email',
        'rights',
        'entry',
        'published',
        'g:publish_date',
        'g:expiration_date',
        'g:event_date_range',
        'g:start',
        'g:end',
        'updated',
        'category',
        'summary',
        'content',
        'awb:city',
        'awb:country',
        'awb:state',
        'awb:postalcode',
        'g:location',
        'g:age_range',
        'g:employer',
        'g:job_type',
        'g:job_industry',
        'awb:paid',
    ]
    xmldoc = xmlh.simple_parser(s, known_elnames, progress)

    pubdate = xmlh.get_tag_val(xmldoc, "created")
    ts = dateutil.parser.parse(pubdate)
    pubdate = ts.strftime("%Y-%m-%dT%H:%M:%S")

    # convert to footprint format
    s = '<?xml version="1.0" ?>'
    s += '<FootprintFeed schemaVersion="0.1">'
    s += '<FeedInfo>'
    # TODO: assign provider IDs?
    s += '<providerID>104</providerID>'
    s += '<providerName>volunteermatch.org</providerName>'
    s += '<feedID>1</feedID>'
    s += '<providerURL>http://www.volunteermatch.org/</providerURL>'
    s += '<createdDateTime>%s</createdDateTime>' % (pubdate)
    s += '<description></description>'
    s += '</FeedInfo>'

    numorgs = numopps = 0

    # hardcoded: Organization
    s += '<Organizations>'
    items = xmldoc.getElementsByTagName("listing")
    if (maxrecs > items.length or maxrecs == -1):
        maxrecs = items.length

    for item in items[0:maxrecs]:
        orgs = item.getElementsByTagName("parent")
        if (orgs.length == 1):
            org = orgs[0]
            s += '<Organization>'
            s += '<organizationID>%s</organizationID>' % (xmlh.get_tag_val(
                org, "key"))
            s += '<nationalEIN></nationalEIN>'
            s += '<name>%s</name>' % (xmlh.get_tag_val(org, "name"))
            s += '<missionStatement></missionStatement>'
            s += '<description></description>'
            s += '<location><city></city><region></region><postalCode></postalCode></location>'
            s += '<organizationURL>%s</organizationURL>' % (xmlh.get_tag_val(
                org, "URL"))
            s += '<donateURL></donateURL>'
            s += '<logoURL></logoURL>'
            s += '<detailURL>%s</detailURL>' % (xmlh.get_tag_val(
                org, "detailURL"))
            s += '</Organization>'
            numorgs += 1
        else:
            print datetime.now(
            ), "parse_volunteermatch: listing does not have an organization"
            return None

    s += '</Organizations>'

    s += '<VolunteerOpportunities>'
    items = xmldoc.getElementsByTagName("listing")
    for item in items[0:maxrecs]:
        s += '<VolunteerOpportunity>'
        s += '<volunteerOpportunityID>%s</volunteerOpportunityID>' % (
            xmlh.get_tag_val(item, "key"))

        orgs = item.getElementsByTagName("parent")
        if (orgs.length == 1):
            org = orgs[0]
            s += '<sponsoringOrganizationIDs><sponsoringOrganizationID>%s</sponsoringOrganizationID></sponsoringOrganizationIDs>' % (
                xmlh.get_tag_val(org, "key"))
        else:
            s += '<sponsoringOrganizationIDs><sponsoringOrganizationID>0</sponsoringOrganizationID></sponsoringOrganizationIDs>'
            print datetime.now(
            ), "parse_volunteermatch: listing does not have an organization"

        s += '<title>%s</title>' % (xmlh.get_tag_val(item, "title"))

        s += '<volunteersNeeded>-8888</volunteersNeeded>'

        s += '<dateTimeDurations><dateTimeDuration>'
        durations = xmlh.get_children_by_tagname(item, "duration")
        if (len(durations) == 1):
            duration = durations[0]
            ongoing = duration.getAttribute("ongoing")
            if (ongoing == 'true'):
                s += '<openEnded>Yes</openEnded>'
            else:
                s += '<openEnded>No</openEnded>'

            listingTimes = duration.getElementsByTagName("listingTime")
            if (listingTimes.length == 1):
                listingTime = listingTimes[0]
                s += '<startTime>%s</startTime>' % (xmlh.get_tag_val(
                    listingTime, "startTime"))
                s += '<endTime>%s</endTime>' % (xmlh.get_tag_val(
                    listingTime, "endTime"))
        else:
            print datetime.now(
            ), "parse_volunteermatch: number of durations in item != 1"
            return None

        commitments = item.getElementsByTagName("commitment")
        l_period = l_duration = ""
        if (commitments.length == 1):
            commitment = commitments[0]
            l_num = xmlh.get_tag_val(commitment, "num")
            l_duration = xmlh.get_tag_val(commitment, "duration")
            l_period = xmlh.get_tag_val(commitment, "period")
            if ((l_duration == "hours") and (l_period == "week")):
                s += '<commitmentHoursPerWeek>' + l_num + '</commitmentHoursPerWeek>'
            elif ((l_duration == "hours") and (l_period == "day")):
                # note: weekdays only
                s += '<commitmentHoursPerWeek>' + str(
                    int(l_num) * 5) + '</commitmentHoursPerWeek>'
            elif ((l_duration == "hours") and (l_period == "month")):
                hrs = int(float(l_num) / 4.0)
                if hrs < 1: hrs = 1
                s += '<commitmentHoursPerWeek>' + str(
                    hrs) + '</commitmentHoursPerWeek>'
            elif ((l_duration == "hours") and (l_period == "event")):
                # TODO: ignore for now, later compute the endTime if not already provided
                pass
            else:
                print datetime.now(
                ), "parse_volunteermatch: commitment given in units != hours/week: ", l_duration, "per", l_period

        s += '</dateTimeDuration></dateTimeDurations>'

        dbaddresses = item.getElementsByTagName("location")
        if (dbaddresses.length != 1):
            print datetime.now(
            ), "parse_volunteermatch: only 1 location supported."
            return None
        dbaddress = dbaddresses[0]
        s += '<locations><location>'
        s += '<streetAddress1>%s</streetAddress1>' % (xmlh.get_tag_val(
            dbaddress, "street1"))
        s += '<city>%s</city>' % (xmlh.get_tag_val(dbaddress, "city"))
        s += '<region>%s</region>' % (xmlh.get_tag_val(dbaddress, "region"))
        s += '<postalCode>%s</postalCode>' % (xmlh.get_tag_val(
            dbaddress, "postalCode"))

        geolocs = item.getElementsByTagName("geolocation")
        if (geolocs.length == 1):
            geoloc = geolocs[0]
            s += '<latitude>%s</latitude>' % (xmlh.get_tag_val(
                geoloc, "latitude"))
            s += '<longitude>%s</longitude>' % (xmlh.get_tag_val(
                geoloc, "longitude"))

        s += '</location></locations>'

        s += '<audienceTags>'
        audiences = item.getElementsByTagName("audience")
        for audience in audiences:
            type = xmlh.node_data(audience)
            s += '<audienceTag>%s</audienceTag>' % (type)
        s += '</audienceTags>'

        s += '<categoryTags>'
        categories = item.getElementsByTagName("category")
        for category in categories:
            type = xmlh.node_data(category)
            s += '<categoryTag>%s</categoryTag>' % (type)
        s += '</categoryTags>'

        s += '<skills>%s</skills>' % (xmlh.get_tag_val(item, "skill"))

        s += '<detailURL>%s</detailURL>' % (xmlh.get_tag_val(
            item, "detailURL"))
        s += '<description>%s</description>' % (xmlh.get_tag_val(
            item, "description"))

        expires = xmlh.get_tag_val(item, "expires")
        ts = dateutil.parser.parse(expires)
        expires = ts.strftime("%Y-%m-%dT%H:%M:%S")
        s += '<expires>%s</expires>' % (expires)

        s += '</VolunteerOpportunity>'
        numopps += 1

    s += '</VolunteerOpportunities>'
    s += '</FootprintFeed>'

    #s = re.sub(r'><([^/])', r'>\n<\1', s)
    #print(s)
    return s, numorgs, numopps

コード例 #8

0

ファイルを表示

ファイル: parse_networkforgood.py プロジェクト: Alwnikrotikz/allforgood

 def parse(instr, maxrecs, progress):
   numorgs = numopps = 0
   instr = re.sub(r'<(/?db):', r'<\1_', instr)
   opps = re.findall(r'<VolunteerOpportunity>.+?</VolunteerOpportunity>',
                     instr, re.DOTALL)
   volopps = ""
   for i, oppstr in enumerate(opps):
     #if progress and i > 0 and i % 250 == 0:
     #  print str(datetime.now())+": ", i, " opportunities processed."
     if (maxrecs > 0 and i > maxrecs):
       break
     xmlh.print_rps_progress("opps", progress, i, maxrecs)
 
     item = xmlh.simple_parser(oppstr, known_elnames, progress=False)
 
     orgid = register_org(item)
 
     # logoURL -- sigh, this is for the opportunity not the org
     volopps += '<VolunteerOpportunity>'
     volopps += xmlh.output_val('volunteerOpportunityID', str(i))
     volopps += xmlh.output_val('sponsoringOrganizationID', str(orgid))
     volopps += xmlh.output_node('volunteerHubOrganizationID', item, "LocalID")
     volopps += xmlh.output_node('title', item, "Title")
     volopps += xmlh.output_node('abstract', item, "Description")
     volopps += xmlh.output_node('description', item, "Description")
     volopps += xmlh.output_node('detailURL', item, "DetailURL")
     volopps += xmlh.output_val('volunteersNeeded', "-8888")
 
     try:
       oppdates = item.getElementsByTagName("OpportunityDate")
     except:
       oppdates = []
     
     if len(oppdates) > 1:
       print datetime.now(), \
           "parse_servenet.py: only 1 OpportunityDate supported."
       #return None
       oppdate = oppdates[0]
     elif len(oppdates) == 0:
       oppdate = None
     else:
       oppdate = oppdates[0]
     volopps += '<dateTimeDurations><dateTimeDuration>'
 
     if oppdate:
       volopps += xmlh.output_val('openEnded', 'No')
       volopps += xmlh.output_val('duration', 'P%s%s' % 
                                 (xmlh.get_tag_val(oppdate, "DurationQuantity"),
                                  xmlh.get_tag_val(oppdate, "DurationUnit")))
       volopps += xmlh.output_val('commitmentHoursPerWeek', '0')
       volopps += xmlh.output_node('startDate', oppdate, "StartDate")
       volopps += xmlh.output_node('endDate', oppdate, "EndDate")
     else:
       volopps += xmlh.output_val('openEnded', 'Yes')
       volopps += xmlh.output_val('commitmentHoursPerWeek', '0')
     volopps += '</dateTimeDuration></dateTimeDurations>'
 
     volopps += '<locations>'
     try:
       opplocs = item.getElementsByTagName("Location")
     except:
       opplocs = []
     for opploc in opplocs:
       volopps += '<location>'
       virtual_tag = opploc.getElementsByTagName("Virtual")
       if virtual_tag and xmlh.get_tag_val(opploc, "Virtual").lower() == "yes":
         volopps += xmlh.output_val('virtual', 'Yes')
       else:
         volopps += xmlh.output_node('region', opploc, "StateOrProvince")
         volopps += xmlh.output_node('country', opploc, "Country")
         volopps += xmlh.output_node('postalCode', opploc, "ZipOrPostalCode")
       volopps += '</location>'
     volopps += '</locations>'
     volopps += '<categoryTags/>'
     volopps += '</VolunteerOpportunity>'
     numopps += 1
     
   # convert to footprint format
   outstr = '<?xml version="1.0" ?>'
   outstr += '<FootprintFeed schemaVersion="0.1">'
   outstr += '<FeedInfo>'
   outstr += xmlh.output_val('providerID', providerID)
   outstr += xmlh.output_val('providerName', providerName)
   outstr += xmlh.output_val('feedID', feedID)
   outstr += xmlh.output_val('createdDateTime', xmlh.current_ts())
   outstr += xmlh.output_val('providerURL', providerURL)
   outstr += xmlh.output_val('description', feedDescription)
   # TODO: capture ts -- use now?!
   outstr += '</FeedInfo>'
 
   # hardcoded: Organization
   outstr += '<Organizations>'
   for key in ORGS:
     outstr += ORGS[key]
     numorgs += 1
   outstr += '</Organizations>'
   outstr += '<VolunteerOpportunities>'
   outstr += volopps
   outstr += '</VolunteerOpportunities>'
   outstr += '</FootprintFeed>'
 
   #outstr = re.sub(r'><([^/])', r'>\n<\1', outstr)
   return outstr, numorgs, numopps

コード例 #9

0

ファイルを表示

                             'sensor' : 'false', 
                             'clientID' : CLIENT_ID})
  try:
    maps_fh = urllib2.urlopen("http://maps.google.com/maps/geo?%s" % params)
    res = maps_fh.read()
    maps_fh.close()
  except IOError, err:
    print_debug("geocode_call: Error contacting Maps API. Sleeping. " + str(err))
    time.sleep(1)
    return geocode_call(query, retries - 1)

  #print_debug("response length: "+str(len(res)))
  if re.search(r'403 Forbidden', res):
    respcode = 403
  else:
    node = xmlh.simple_parser(res, [], False)
    respcode = xmlh.get_tag_val(node, "code")
    if respcode == "":
      #print_debug("unparseable response: "+res)
      return False

  respcode = int(respcode)
  if respcode in (400, 601, 602, 603):  # problem with the query
    return None

  if respcode in (403, 500, 620):  # problem with the server
    print_debug("geocode_call: Connection problem or quota exceeded.  Sleeping...")
    if retries == 4:
      xmlh.print_progress("geocoder: %d" % respcode, "", SHOW_PROGRESS)
    time.sleep(5)
    return geocode_call(query, retries - 1)

コード例 #10

0

ファイルを表示

def parse(instr, maxrecs, progress):
    """Return FPXML (Footprint XML) built from usaservice feed data.

    Every line of ``instr`` is handed to ``xmlh.simple_parser`` on its own,
    so the input must carry one complete item per line.

    Args:
      instr: raw feed text. ``<db:...>`` / ``</db:...>`` tags are rewritten
        to ``<db_...>`` / ``</db_...>`` before the lines are parsed.
      maxrecs: when greater than zero, processing stops as soon as the
        zero-based line index exceeds this value.
      progress: passed through to ``xmlh.print_rps_progress``.

    Returns:
      A ``(fpxml, numorgs, numopps)`` tuple on success.  ``None`` is returned
      (after printing a message) when an item does not contain exactly one
      ``db_scheduledTime`` element or exactly one ``db_address`` element.

    Raises:
      ValueError: if an item's ``db_dateTime`` value does not split into
        exactly two space-separated parts (date and time).
    """
    # TODO: progress
    known_elnames = [
        'channel',
        'db:abstract',
        'db:address',
        'db:attendee_count',
        'db:categories',
        'db:city',
        'db:country',
        'db:county',
        'db:dateTime',
        'db:event',
        'db:eventType',
        'db:guest_total',
        'db:host',
        'db:latitude',
        'db:length',
        'db:longitude',
        'db:rsvp',
        'db:scheduledTime',
        'db:state',
        'db:street',
        'db:title',
        'db:venue_name',
        'db:zipcode',
        'description',
        'docs',
        'guid',
        'item',
        'language',
        'link',
        'pubDate',
        'rss',
        'title',
    ]

    # Fragments are collected in a list and joined once at the end, rather
    # than re-growing one big string for every field of every record.
    parts = []
    emit = parts.append

    # convert to footprint format
    emit('<?xml version="1.0" ?>')
    emit('<FootprintFeed schemaVersion="0.1">')
    emit('<FeedInfo>')
    # TODO: assign provider IDs?
    emit('<providerID>101</providerID>')
    emit('<providerName>usaservice.org</providerName>')
    emit('<feedID>1</feedID>')
    emit('<createdDateTime>%s</createdDateTime>' % xmlh.current_ts())
    emit('<providerURL>http://www.usaservice.org/</providerURL>')
    emit('<description>Syndicated events</description>')
    # TODO: capture ts -- use now?!
    emit('</FeedInfo>')

    # hardcoded: a single placeholder Organization (ID 0) that every
    # opportunity below refers to.
    emit('<Organizations>')
    emit('<Organization>')
    emit('<organizationID>0</organizationID>')
    emit('<nationalEIN></nationalEIN>')
    emit('<name></name>')
    emit('<missionStatement></missionStatement>')
    emit('<description></description>')
    emit('<location><city></city><region></region><postalCode></postalCode></location>')
    emit('<organizationURL></organizationURL>')
    emit('<donateURL></donateURL>')
    emit('<logoURL></logoURL>')
    emit('<detailURL></detailURL>')
    emit('</Organization>')
    emit('</Organizations>')
    numorgs = 1
    numopps = 0

    emit('<VolunteerOpportunities>')

    # Loop-invariant: matches pubDate values shaped like "19 Jan 2009".
    pubdate_re = re.compile(
        "[0-9][0-9] [A-Z][a-z][a-z] [0-9][0-9][0-9][0-9]")

    # Rename the "db:" namespace prefix to "db_" in opening and closing tags.
    instr = re.sub(r'<(/?db):', r'<\1_', instr)
    for i, line in enumerate(instr.splitlines()):
        # NOTE(review): "i > maxrecs" lets maxrecs + 1 lines through before
        # breaking; left as-is so the limit keeps its existing behaviour.
        if maxrecs > 0 and i > maxrecs:
            break
        xmlh.print_rps_progress("opps", progress, i, maxrecs)
        item = xmlh.simple_parser(line, known_elnames, progress=False)

        # unmapped: db_rsvp  (seems to be same as link, but with #rsvp at end of url?)
        # unmapped: db_county  (seems to be empty)
        # unmapped: attendee_count
        # unmapped: guest_total
        # unmapped: db_title   (dup of title, above)
        emit('<VolunteerOpportunity>')
        emit('<volunteerOpportunityID>%s</volunteerOpportunityID>' % (
            xmlh.get_tag_val(item, "guid")))
        # hardcoded: sponsoringOrganizationID
        emit('<sponsoringOrganizationIDs><sponsoringOrganizationID>0</sponsoringOrganizationID></sponsoringOrganizationIDs>')
        # hardcoded: volunteerHubOrganizationID
        emit('<volunteerHubOrganizationIDs><volunteerHubOrganizationID>0</volunteerHubOrganizationID></volunteerHubOrganizationIDs>')
        emit('<title>%s</title>' % (xmlh.get_tag_val(item, "title")))
        # known_elnames lists only 'db:abstract' (rewritten to db_abstract
        # above), so fall back to that tag when a plain <abstract> lookup
        # yields nothing.
        abstract = (xmlh.get_tag_val(item, "abstract") or
                    xmlh.get_tag_val(item, "db_abstract"))
        emit('<abstract>%s</abstract>' % (abstract))
        emit('<volunteersNeeded>-8888</volunteersNeeded>')

        sched_nodes = item.getElementsByTagName("db_scheduledTime")
        if sched_nodes.length != 1:
            # Single pre-formatted argument: prints the same text whether
            # print is a statement or a function.
            print("%s %s" % (
                datetime.now(),
                "parse_usaservice: only 1 db_scheduledTime supported."))
            return None
        sched_node = sched_nodes[0]
        emit('<dateTimeDurations><dateTimeDuration>')
        length = xmlh.get_tag_val(sched_node, "db_length")
        if length == "" or length == "-1":
            emit('<openEnded>Yes</openEnded>')
        else:
            emit('<openEnded>No</openEnded>')
        # Strict unpack: anything other than exactly "<date> <time>" raises
        # ValueError here.
        start_date, start_time = xmlh.get_tag_val(
            sched_node, "db_dateTime").split(" ")
        emit('<startDate>%s</startDate>' % (start_date))
        # TODO: timezone???
        emit('<startTime>%s</startTime>' % (start_time))
        emit('</dateTimeDuration></dateTimeDurations>')

        addr_nodes = item.getElementsByTagName("db_address")
        if addr_nodes.length != 1:
            print("%s %s" % (
                datetime.now(),
                "parse_usaservice: only 1 db_address supported."))
            return None
        addr_node = addr_nodes[0]
        emit('<locations><location>')
        emit('<name>%s</name>' % (xmlh.get_tag_val(item, "db_venue_name")))
        emit('<streetAddress1>%s</streetAddress1>' % (xmlh.get_tag_val(
            addr_node, "db_street")))
        emit('<city>%s</city>' % (xmlh.get_tag_val(addr_node, "db_city")))
        emit('<region>%s</region>' % (xmlh.get_tag_val(addr_node, "db_state")))
        emit('<country>%s</country>' % (xmlh.get_tag_val(
            addr_node, "db_country")))
        emit('<postalCode>%s</postalCode>' % (xmlh.get_tag_val(
            addr_node, "db_zipcode")))
        emit('<latitude>%s</latitude>' % (xmlh.get_tag_val(
            item, "db_latitude")))
        emit('<longitude>%s</longitude>' % (xmlh.get_tag_val(
            item, "db_longitude")))
        emit('</location></locations>')

        event_type = xmlh.get_tag_val(item, "db_eventType")
        emit('<categoryTags><categoryTag>%s</categoryTag></categoryTags>' % (
            event_type))

        # db_host is emitted as the contact name.
        emit('<contactName>%s</contactName>' % xmlh.get_tag_val(
            item, "db_host"))
        emit('<detailURL>%s</detailURL>' % (xmlh.get_tag_val(item, "link")))
        emit('<description>%s</description>' % (xmlh.get_tag_val(
            item, "description")))
        pubdate = xmlh.get_tag_val(item, "pubDate")
        if pubdate_re.search(pubdate):
            # TODO: parse() is ignoring timezone...
            ts = dateutil.parser.parse(pubdate)
            pubdate = ts.strftime("%Y-%m-%dT%H:%M:%S")
        # A pubDate that does not match the pattern is passed through as-is.
        emit('<lastUpdated>%s</lastUpdated>' % (pubdate))
        emit('</VolunteerOpportunity>')
        numopps += 1

    emit('</VolunteerOpportunities>')
    emit('</FootprintFeed>')
    fpxml = "".join(parts)
    #fpxml = re.sub(r'><([^/])', r'>\n<\1', fpxml)
    return fpxml, numorgs, numopps