Example #1
def pipe_itembuilder(context, _INPUT, conf, **kwargs):
    """This source builds an item.
    
    Keyword arguments:
    context -- pipeline context
    _INPUT -- source generator
    conf:
        attrs -- key, value pairs
        
    Yields (_OUTPUT):
    item
    """
    attrs = conf['attrs']
    
    for item in _INPUT:
        d = {}
        for attr in attrs:
            try:
                key = util.get_value(attr['key'], item, **kwargs)
                value = util.get_value(attr['value'], item, **kwargs)
            except KeyError:
                continue  #ignore if the item is referenced but doesn't have our source or target field (todo: issue a warning if debugging?)
            
            util.set_value(d, key, value)
        
        yield d
        
        if item == True: #i.e. this is being fed forever, i.e. not in a loop, so we just yield our item once
            break
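For orientation, a hypothetical conf for this module: the attr keys 'key' and 'value' come from the code above, and the inner {'type': ..., 'value': ...} wrappers follow the shape pipe_fetchsitefeed passes around (Example #8); what util.get_value accepts beyond that is an assumption here.

conf = {
    'attrs': [
        #each attr names a target key and the value to store under it,
        #both dereferenced through util.get_value
        {'key': {'type': 'text', 'value': 'title'},
         'value': {'type': 'text', 'value': 'hello world'}},
    ]
}
#each item yielded would then be {'title': 'hello world'}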
            
Example #2
def pipe_sort(context, _INPUT, conf, **kwargs):
    """This operator sorts the input source according to the specified key. 

    Keyword arguments:
    context -- pipeline context        
    _INPUT -- source generator
    kwargs -- other inputs, e.g. to feed terminals for rule values
    conf:
        KEY -- list of fields to sort by
    
    Yields (_OUTPUT):
    source items sorted by key
    """
    order = []

    keys = conf['KEY']
    if not isinstance(keys, list):
        keys = [keys]
    for key in keys:
        field = util.get_value(key['field'], None, **kwargs)
        sort_dir = util.get_value(key['dir'], None, **kwargs)
        order.append('%s%s' % ('-' if sort_dir == 'DESC' else '', field))

    #read all and sort
    sorted_input = []
    for item in _INPUT:
        sorted_input.append(item)
    sorted_input = util.multikeysort(sorted_input, order)

    for item in sorted_input:
        yield item
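util.multikeysort is not shown in these examples; a minimal standalone sketch of what such a helper could look like, assuming '-' prefixes mark descending keys as the order strings above suggest. Python's stable sort lets us apply the keys from least to most significant.

def multikeysort(items, order):
    #apply sort keys from least to most significant; sorted() is stable,
    #so earlier keys in `order` win ties left by the later passes
    for spec in reversed(order):
        reverse = spec.startswith('-')
        field = spec.lstrip('-')
        items = sorted(items, key=lambda item: item.get(field), reverse=reverse)
    return items

#e.g. multikeysort([{'a': 1, 'b': 2}, {'a': 1, 'b': 1}], ['a', '-b'])
#sorts by 'a' ascending, breaking ties with 'b' descending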
Example #3
def pipe_strreplace(context, _INPUT, conf, **kwargs):
    """Replaces text with replacement text.
    
    Keyword arguments:
    context -- pipeline context
    _INPUT -- source generator
    conf:
        RULE -- rules - each rule comprising (find, param, replace):
            find -- text to find
            param -- type of match: 1=first, 2=last, 3=every
            replace -- text to replace with
    
    Yields (_OUTPUT):
    source string with replacements
    """
    rules = []
       
    for rule in conf['RULE']:
        find = util.get_value(rule['find'], None, **kwargs)
        param = util.get_value(rule['param'], None, **kwargs)
        replace = util.get_value(rule['replace'], None, **kwargs)
        rules.append((find, param, replace))

    for item in _INPUT:
        t = item
        for rule in rules:
            if rule[1] == '1':
                t = t.replace(rule[0], rule[2], 1)
            elif rule[1] == '2':
                t = util.rreplace(t, rule[0], rule[2], 1)
            elif rule[1] == '3':
                t = t.replace(rule[0], rule[2])
            #todo else assertion
            
        yield t
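util.rreplace (replace only the last occurrence, for param == '2') is referenced but not shown. A common way to get that behaviour with plain strings is rsplit:

def rreplace(s, old, new, count):
    #split from the right at most `count` times, then rejoin with `new`
    return new.join(s.rsplit(old, count))

#rreplace('a-b-c', '-', '+', 1) -> 'a-b+c'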
Example #4
def pipe_itembuilder(context, _INPUT, conf, **kwargs):
    """This source builds an item.
    
    Keyword arguments:
    context -- pipeline context
    _INPUT -- source generator
    conf:
        attrs -- key, value pairs
        
    Yields (_OUTPUT):
    item
    """
    attrs = conf['attrs']

    for item in _INPUT:
        d = {}
        for attr in attrs:
            try:
                key = util.get_value(attr['key'], item, **kwargs)
                value = util.get_value(attr['value'], item, **kwargs)
            except KeyError:
                continue  #ignore if the item is referenced but doesn't have our source or target field (todo: issue a warning if debugging?)

            util.set_value(d, key, value)

        yield d

        if item == True:  #i.e. this is being fed forever, i.e. not in a loop, so we just yield our item once
            break
Example #5
def pipe_sort(context, _INPUT, conf, **kwargs):
    """This operator sorts the input source according to the specified key. 

    Keyword arguments:
    context -- pipeline context        
    _INPUT -- source generator
    kwargs -- other inputs, e.g. to feed terminals for rule values
    conf:
        KEY -- list of fields to sort by
    
    Yields (_OUTPUT):
    source items sorted by key
    """
    order = []
       
    keys = conf['KEY']
    if not isinstance(keys, list):
        keys = [keys]
    for key in keys:
        field = util.get_value(key['field'], None, **kwargs)
        sort_dir = util.get_value(key['dir'], None, **kwargs)
        order.append('%s%s' % ('-' if sort_dir == 'DESC' else '', field))

    #read all and sort
    sorted_input = []
    for item in _INPUT:
        sorted_input.append(item)
    sorted_input = util.multikeysort(sorted_input, order)
            
    for item in sorted_input:
        yield item
        
Example #6
def pipe_strregex(context, _INPUT, conf, **kwargs):
    """This operator replaces values using regexes. 

    Keyword arguments:
    context -- pipeline context
    _INPUT -- source generator
    kwargs -- other inputs, e.g. to feed terminals for rule values
    conf:
        RULE -- rules - each rule comprising (match, replace)
    
    Yields (_OUTPUT):
    source item after replacing values matching regexes
    """
    rules = []
    
    rule_defs = conf['RULE']
    if not isinstance(rule_defs, list):
        rule_defs = [rule_defs]
    
    for rule in rule_defs:
        #TODO compile regex here: c = re.compile(match)
        match = util.get_value(rule['match'], None, **kwargs) #todo use subkey?
        replace = util.get_value(rule['replace'], None, **kwargs) #todo use subkey?
        
        #convert regex to Python format: todo use a common routine for this
        replace = re.sub(r'\$(\d+)', r'\\\1', replace)   #map $1 to \1 etc.   #todo: also need to escape any existing \1 etc.
        
        rules.append((match, replace))
    
    for item in _INPUT:
        for rule in rules:
            item = re.sub(rule[0], rule[1], item)
            
        yield item
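The $1 -> \1 rewrite above, in isolation. As the inline todo notes, it does not escape backslash references already present in the replacement text:

import re

def dollar_to_backslash(replace):
    #turn Yahoo-Pipes-style group references ($1, $2, ...) into Python's \1, \2, ...
    return re.sub(r'\$(\d+)', r'\\\1', replace)

#re.sub(r'(\w+) (\w+)', dollar_to_backslash('$2 $1'), 'hello world') -> 'world hello'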
Example #7
def pipe_regex(context, _INPUT, conf, **kwargs):
    """This operator replaces values using regexes. 

    Keyword arguments:
    context -- pipeline context
    _INPUT -- source generator
    kwargs -- other inputs, e.g. to feed terminals for rule values
    conf:
        RULE -- rules - each rule comprising (field, match, replace)
    
    Yields (_OUTPUT):
    source items after replacing values matching regexes
    """
    rules = []

    rule_defs = conf['RULE']
    if not isinstance(rule_defs, list):
        rule_defs = [rule_defs]

    for rule in rule_defs:
        #todo use the undocumented g,s,m,i flags here: rule['singlelinematch']['value'] == 2 indicates re.DOTALL
        # so use that to pass to re.compile: see here for more http://livedocs.adobe.com/flex/3/html/help.html?content=12_Using_Regular_Expressions_10.html
        match = util.get_value(rule['match'], None,
                               **kwargs)  #todo use subkey?
        matchc = re.compile(
            match, re.DOTALL)  #compile for speed and we need to pass flags
        replace = util.get_value(rule['replace'], None,
                                 **kwargs)  #todo use subkey?
        if replace is None:
            replace = ''

        #convert regex to Python format: todo use a common routine for this
        replace = re.sub(
            r'\$(\d+)', r'\\\1', replace
        )  #map $1 to \1 etc.   #todo: also need to escape any existing \1 etc.

        rules.append((rule['field']['value'], matchc, replace))

    for item in _INPUT:

        def sub_fields(matchobj):
            return util.get_value({'subkey': matchobj.group(1)}, item)

        for rule in rules:
            #todo: do we ever need get_value here instead of item[]?
            if rule[0] in item and item[rule[0]]:
                util.set_value(
                    item, rule[0],
                    re.sub(rule[1], rule[2], unicode(item[rule[0]])))

                util.set_value(
                    item, rule[0],
                    re.sub(r'\$\{(.+)\}', sub_fields, unicode(item[rule[0]])))

        yield item
Example #8
def pipe_fetchsitefeed(context, _INPUT, conf, **kwargs):
    """This source fetches and parses the first feed found on one or more sites 
       to yield the feed entries.
    
    Keyword arguments:
    context -- pipeline context       
    _INPUT -- not used
    conf:
        URL -- url
    
    Yields (_OUTPUT):
    feed entries
    """
    forever = pipe_forever(context, None, conf=None)
    
    urls = conf['URL']
    if not isinstance(urls, list):
        urls = [urls]
            
    for item in _INPUT:
        for item_url in urls:
            url = util.get_value(item_url, item, **kwargs)
            
            if '://' not in url:
                url = 'http://' + url
            
            if context.verbose:
                print "pipe_fetchsitefeed loading:", url
            
            for feed in pipe_feedautodiscovery(context, forever, {u'URL': {u'type': u'url', u'value': url}}):
                for feed_item in pipe_fetch(context, forever, {u'URL': {u'type': u'url', u'value': feed['link']}}):
                    yield feed_item
                
        if item == True: #i.e. this is being fed forever, i.e. not in a loop, so we just yield our item once
            break
Example #9
def pipe_tail(context, _INPUT, conf, **kwargs):
    """This operator truncates the number of items in a feed.

    Keyword arguments:
    context -- pipeline context
    _INPUT -- source generator
    kwargs -- terminal, if the truncation value is wired in
    conf:
        count -- length of the truncated feed, if specified literally
        
    Yields (_OUTPUT):
    tail-truncated list of source items
    """

    count = conf['count']
    limit = int(util.get_value(count, None, **kwargs))

    try:
        #if python 2.6+ we can use a sliding window and save memory
        from collections import deque
        buffer = deque(_INPUT, limit)
    except (ImportError, TypeError):  #no deque, or deque without maxlen (pre-2.6): buffer everything
        buffer = []
    for item in _INPUT:
        buffer.append(item)
    
    #slice [-limit:] in a list/deque compatible way
    for i in xrange(-1, -(min(len(buffer), limit)+1), -1):
        yield buffer[i]
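The deque trick above, in isolation: a deque constructed with a maxlen keeps only the most recent `limit` items, so the whole input never has to sit in memory.

from collections import deque

last3 = deque(xrange(10), 3)
#deque([7, 8, 9], maxlen=3): older entries are discarded as newer ones arrive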
Example #10
def pipe_uniq(context, _INPUT, conf, **kwargs):
    """This operator filters out non unique items according to the specified field. 

    Keyword arguments:
    context -- pipeline context        
    _INPUT -- source generator
    kwargs -- other inputs, e.g. to feed terminals for rule values
    conf:
        field -- field to be unique
    
    Yields (_OUTPUT):
    source items, one per unique field value
    """

    field = util.get_value(conf['field'], None, **kwargs)
    order = [field]

    #read all and sort
    sorted_input = []
    for item in _INPUT:
        sorted_input.append(item)
    sorted_input = util.multikeysort(sorted_input, order)

    seen = None
    for item in sorted_input:
        #todo: do we ever need get_value here instead of item[]?
        if seen != item[field]:
            yield item
            seen = item[field]
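The sort-then-dedupe pattern above can also be written with itertools.groupby, which yields one group per run of equal keys once the input is sorted; a standalone sketch:

from itertools import groupby

def uniq_by(items, field):
    def keyfunc(item):
        return item[field]
    for _, group in groupby(sorted(items, key=keyfunc), key=keyfunc):
        yield next(group)   #one representative item per unique field value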
Example #11
def pipe_feedautodiscovery(context, _INPUT, conf, **kwargs):
    """This source search for feed links in a page
    
    Keyword arguments:
    context -- pipeline context       
    _INPUT -- not used
    conf:
        URL -- url
    
    Yields (_OUTPUT):
    feed entries
    """
    urls = conf['URL']
    if not isinstance(urls, list):
        urls = [urls]
    
    for item in _INPUT:
        for item_url in urls:
            url = util.get_value(item_url, item, **kwargs)

            if '://' not in url:
                url = 'http://' + url
            
            if context.verbose:
                print "pipe_feedautodiscovery loading:", url
            d = autorss.getRSSLink(url.encode('utf-8'))
            
            for entry in d:
                yield {'link':entry}
                #todo add rel, type, title
    
        if item == True: #i.e. this is being fed forever, i.e. not in a loop, so we just yield our item once
            break
Example #12
def pipe_tail(context, _INPUT, conf, **kwargs):
    """This operator truncates the number of items in a feed.

    Keyword arguments:
    context -- pipeline context
    _INPUT -- source generator
    kwargs -- terminal, if the truncation value is wired in
    conf:
        count -- length of the truncated feed, if specified literally
        
    Yields (_OUTPUT):
    tail-truncated list of source items
    """

    count = conf['count']
    limit = int(util.get_value(count, None, **kwargs))

    try:
        #if python 2.6+ we can use a sliding window and save memory
        from collections import deque
        buffer = deque(_INPUT, limit)
    except (ImportError, TypeError):  #no deque, or deque without maxlen (pre-2.6): buffer everything
        buffer = []
    for item in _INPUT:
        buffer.append(item)

    #slice [-limit:] in a list/deque compatible way
    for i in xrange(-1, -(min(len(buffer), limit) + 1), -1):
        yield buffer[i]
Example #13
def pipe_feedautodiscovery(context=None, _INPUT=None, conf=None, **kwargs):
    """This source search for feed links in a page

    Keyword arguments:
    context -- pipeline context
    _INPUT -- not used
    conf:
        URL -- url

    Yields (_OUTPUT):
    feed entries
    """
    conf = DotDict(conf)
    urls = util.listize(conf['URL'])

    for item in _INPUT:
        for item_url in urls:
            url = util.get_value(DotDict(item_url), DotDict(item), **kwargs)
            url = util.get_abspath(url)

            if context and context.verbose:
                print "pipe_feedautodiscovery loading:", url

            for entry in autorss.getRSSLink(url.encode('utf-8')):
                yield {'link': entry}
                # todo: add rel, type, title

        if item.get('forever'):
            # _INPUT is pipeforever and not a loop,
            # so we just yield our item once
            break
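util.listize and DotDict are used here but not defined in these examples; listize is presumably the same single-value-to-list normalisation that the older examples inline:

def listize(value):
    #conf entries may be a single dict or a list of them; always return a list
    return value if isinstance(value, list) else [value]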
Example #14
def pipe_uniq(context, _INPUT, conf, **kwargs):
    """This operator filters out non unique items according to the specified field. 

    Keyword arguments:
    context -- pipeline context        
    _INPUT -- source generator
    kwargs -- other inputs, e.g. to feed terminals for rule values
    conf:
        field -- field to be unique
    
    Yields (_OUTPUT):
    source items, one per unique field value
    """
       
    field = util.get_value(conf['field'], None, **kwargs)
    order = [field]

    #read all and sort
    sorted_input = []
    for item in _INPUT:
        sorted_input.append(item)
    sorted_input = util.multikeysort(sorted_input, order)
            
    seen = None
    for item in sorted_input:
        #todo: do we ever need get_value here instead of item[]?
        v = util.get_subkey(field, item)
        if seen != v:
            yield item
            seen = v
Example #15
def pipe_subelement(context, _INPUT, conf, **kwargs):
    """Returns a subelement.
    
    Keyword arguments:
    context -- pipeline context
    _INPUT -- source generator
    conf:
        path -- contains the value and type to select
    
    Yields (_OUTPUT):
    subelement of source item
    """
    path = conf['path']
    path['subkey'] = path['value']  #switch to using as a reference
    del path['value']

    for item in _INPUT:
        t = util.get_value(path, item)
        if t:
            if isinstance(t, list):
                for nested_item in t:
                    yield nested_item
            else:
                yield t

        if item == True:  #i.e. this is being fed forever, i.e. not in a loop, so we just yield our item once
            break
Example #16
def pipe_strconcat(context, _INPUT, conf, **kwargs):
    """This source builds a string.
    
    Keyword arguments:
    context -- pipeline context
    _INPUT -- source generator
    conf:
        part -- parts
    
    Yields (_OUTPUT):
    string
    """
    if not isinstance(conf['part'], list):    #todo do we need to do this anywhere else?
        conf['part'] = [conf['part']]

    for item in _INPUT:
        s = ""
        for part in conf['part']:
            try:
                s += util.get_value(part, item, **kwargs)
            except AttributeError:
                continue  #ignore if the item is referenced but doesn't have our source field (todo: issue a warning if debugging?)
            except TypeError:
                if context.verbose:
                    print "pipe_strconcat: TypeError"
    
        yield s
Example #17
def pipe_rssitembuilder(context, _INPUT, conf, **kwargs):
    """This source builds an rss item.
    
    Keyword arguments:
    context -- pipeline context
    _INPUT -- source generator
    conf:
        dictionary of key/values
    Yields (_OUTPUT):
    item
    """
    
    for item in _INPUT:
        d = {}
        
        for key in conf:
            try:
                value = util.get_value(conf[key], item, **kwargs)  #todo really dereference item? (sample pipe seems to suggest so: surprising)
            except KeyError:
                continue  #ignore if the source doesn't have our source field (todo: issue a warning if debugging?)
            
            key = map_key_to_rss.get(key, key)
            
            if value:
                if key == 'title':
                    util.set_value(d, 'y:%s' % key, value)
                #todo also for guid -> y:id (is guid the only one?)

                #todo try/except?
                util.set_value(d, key, value)
        
        yield d
        
        if item == True: #i.e. this is being fed forever, i.e. not in a loop, so we just yield our item once
            break
Example #18
def pipe_subelement(context, _INPUT, conf, **kwargs):
    """Returns a subelement.
    
    Keyword arguments:
    context -- pipeline context
    _INPUT -- source generator
    conf:
        path -- contains the value and type to select
    
    Yields (_OUTPUT):
    subelement of source item
    """
    path = conf['path']
    path['subkey'] = path['value']  #switch to using as a reference
    del path['value']

    for item in _INPUT:
        t = util.get_value(path, item)
        if t:
            if isinstance(t, list):
                for nested_item in t:
                    yield nested_item
            else:
                yield t
            
        if item == True: #i.e. this is being fed forever, i.e. not in a loop, so we just yield our item once
            break        
Example #19
def pipe_dateformat(context, _INPUT, conf, **kwargs):
    """This source formats a date.
    
    Keyword arguments:
    context -- pipeline context
    _INPUT -- source generator
    conf:
        format -- date format
    
    Yields (_OUTPUT):
    formatted date
    """
    date_format = util.get_value(conf['format'], None, **kwargs)

    for item in _INPUT:
        s = item
        if isinstance(s, basestring):
            for df in util.ALTERNATIVE_DATE_FORMATS:
                try:
                    s = datetime.strptime(s, df).timetuple()
                    break
                except ValueError:
                    pass
            else:
                #todo: raise an exception: unexpected date format
                pass
        s = time.strftime(date_format, s)   #todo check all PHP formats are covered by Python
        #todo silent error handling? e.g. if item is not a date
        
        yield s
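The parse/format round trip above, in isolation: strptime tries a candidate format, timetuple() converts to the struct that time.strftime expects, and strftime re-renders it with the user's format.

import time
from datetime import datetime

tt = datetime.strptime('2011-05-04', '%Y-%m-%d').timetuple()
print time.strftime('%d %b %Y', tt)   #04 May 2011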
Example #20
def pipe_fetch(context=None, _INPUT=None, conf=None, **kwargs):
    """Fetches and parses one or more feeds to yield the feed entries.

    Keyword arguments:
    context -- pipeline context
    _INPUT -- not used
    conf:
        URL -- url

    Yields (_OUTPUT):
    feed entries
    """
    conf = DotDict(conf)
    urls = util.listize(conf['URL'])

    for item in _INPUT:
        for item_url in urls:
            url = util.get_value(DotDict(item_url), DotDict(item), **kwargs)
            url = util.get_abspath(url)

            if not url:
                continue

            if context and context.verbose:
                print "pipe_fetch loading:", url

            parsed = feedparser.parse(urlopen(url).read())

            for entry in util.gen_entries(parsed):
                yield entry

        if item.get('forever'):
            # _INPUT is pipeforever and not a loop,
            # so we just yield our item once
            break
Example #21
def pipe_rename(context, _INPUT, conf, **kwargs):
    """This operator renames or copies fields in the input source. 

    Keyword arguments:
    context -- pipeline context
    _INPUT -- source generator
    kwargs -- other inputs, e.g. to feed terminals for rule values
    conf:
        RULE -- rules - each rule comprising (op, field, newval)
    
    Yields (_OUTPUT):
    source items after copying/renaming
    """
    rules = []

    rule_defs = conf['RULE']
    if not isinstance(rule_defs, list):
        rule_defs = [rule_defs]

    for rule in rule_defs:
        newval = util.get_value(rule['newval'], None,
                                **kwargs)  #todo use subkey?
        newfield = rule['field']
        #trick the get_value in the loop to mapping value onto an item key (rather than taking it literally, i.e. make it a LHS reference, not a RHS value)
        newfield['subkey'] = newfield['value']
        del newfield['value']

        rules.append((rule['op']['value'], newfield, newval))

    for item in _INPUT:
        for rule in rules:
            try:
                value = util.get_value(
                    rule[1], item,
                    **kwargs)  #forces an exception if any part is not found
                util.set_value(item, rule[2], value)
                if rule[0] == 'rename':
                    try:
                        util.del_value(item, rule[1]['subkey'])
                    except (
                            KeyError, TypeError
                    ):  #TypeError catches pseudo subkeys, e.g. summary.content
                        pass  #ignore if the target doesn't have our field (todo: issue a warning if debugging?)
            except AttributeError:
                pass  #ignore if the source doesn't have our field (todo: issue a warning if debugging?)
        yield item
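The value -> subkey switch above, in isolation: util.get_value treats {'value': x} as a literal and {'subkey': x} as a reference into the current item, so rewriting the conf dict turns the literal string 'title' into "look up the item's title field":

field = {'type': 'text', 'value': 'title'}
field['subkey'] = field['value']
del field['value']
#field is now {'type': 'text', 'subkey': 'title'} -- a LHS reference, not a RHS value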
Example #22
def pipe_regex(context, _INPUT, conf, **kwargs):
    """This operator replaces values using regexes. 

    Keyword arguments:
    context -- pipeline context
    _INPUT -- source generator
    kwargs -- other inputs, e.g. to feed terminals for rule values
    conf:
        RULE -- rules - each rule comprising (field, match, replace)
    
    Yields (_OUTPUT):
    source items after replacing values matching regexes
    """
    rules = []

    rule_defs = conf['RULE']
    if not isinstance(rule_defs, list):
        rule_defs = [rule_defs]
    
    for rule in rule_defs:
        #todo use the undocumented g,s,m,i flags here: rule['singlelinematch']['value'] == 2 indicates re.DOTALL
        # so use that to pass to re.compile: see here for more http://livedocs.adobe.com/flex/3/html/help.html?content=12_Using_Regular_Expressions_10.html
        match = util.get_value(rule['match'], None, **kwargs) #todo use subkey?
        matchc = re.compile(match, re.DOTALL)  #compile for speed and we need to pass flags
        replace = util.get_value(rule['replace'], None, **kwargs) #todo use subkey?
        if replace is None:
            replace = ''
        
        #convert regex to Python format: todo use a common routine for this
        replace = re.sub(r'\$(\d+)', r'\\\1', replace)   #map $1 to \1 etc.   #todo: also need to escape any existing \1 etc.

        rules.append((rule['field']['value'], matchc, replace))
            
    for item in _INPUT:
        def sub_fields(matchobj):
            return util.get_value({'subkey':matchobj.group(1)}, item)
            
        for rule in rules:
            #todo: do we ever need get_value here instead of item[]?
            if rule[0] in item and item[rule[0]]:
                util.set_value(item, rule[0], re.sub(rule[1], rule[2], unicode(item[rule[0]])))
    
                util.set_value(item, rule[0], re.sub(r'\$\{(.+)\}', sub_fields, unicode(item[rule[0]])))
            
        yield item
Example #23
def pipe_csv(context, _INPUT, conf, **kwargs):
    """This source fetches and parses a csv file to yield items.
    
    Keyword arguments:
    context -- pipeline context       
    _INPUT -- not used
    conf:
        URL -- url
        skip -- number of header rows to skip
        col_mode -- column name source: row=header row(s), custom=defined in col_name
        col_name -- list of custom column names
        col_row_start -- first column header row
        col_row_end -- last column header row
        separator -- column separator
    
    Yields (_OUTPUT):
    file entries
    
    Note:
    Current restrictions:
      separator must be 1 character
      assumes every row has exactly the expected number of fields, as defined in the header
    """
    col_name = conf['col_name']

    for item in _INPUT:
        url = util.get_value(conf['URL'], item, **kwargs)
        separator = util.get_value(conf['separator'], item,
                                   **kwargs).encode('utf-8')
        skip = int(util.get_value(conf['skip'], item, **kwargs))
        col_mode = util.get_value(conf['col_mode'], item, **kwargs)
        col_row_start = int(
            util.get_value(conf['col_row_start'], item, **kwargs))
        col_row_end = int(util.get_value(conf['col_row_end'], item, **kwargs))

        f = urllib2.urlopen(url)

        if context.verbose:
            print "pipe_csv loading:", url

        for i in xrange(skip):
            f.next()

        reader = UnicodeReader(f, delimiter=separator)

        fieldnames = []
        if col_mode == 'custom':
            fieldnames = [util.get_value(x) for x in col_name]
        else:
            for row in xrange((col_row_end - col_row_start) + 1):
                row = reader.next()
                fieldnames.extend(row)

        for row in reader:
            d = dict(zip(fieldnames, row))
            yield d

        if item == True:  #i.e. this is being fed forever, i.e. not in a loop, so we just yield our item once
            break
Example #24
def pipe_urlbuilder(context, _INPUT, conf, **kwargs):
    """This source builds a url and yields it forever.
    
    Keyword arguments:
    context -- pipeline context
    _INPUT -- not used
    conf:
        BASE -- base
        PATH -- path elements
        PARAM -- query parameters
    
    Yields (_OUTPUT):
    url
    """

    for item in _INPUT:
        #note: we could cache get_value results if item==True
        url = util.get_value(conf['BASE'], item, **kwargs)
        if not url.endswith('/'):
            url += '/'

        if 'PATH' in conf:
            path = conf['PATH']
            if not isinstance(path, list):
                path = [path]
            path = [util.get_value(p, item, **kwargs) for p in path if p]

            url += "/".join(p for p in path if p)
        url = url.rstrip("/")

        #Ensure url is valid
        url = util.url_quote(url)

        param_defs = conf['PARAM']
        if not isinstance(param_defs, list):
            param_defs = [param_defs]

        params = dict([(util.get_value(p['key'], item, **kwargs),
                        util.get_value(p['value'], item, **kwargs))
                       for p in param_defs if p])
        if params and params.keys() != [u'']:
            url += "?" + urllib.urlencode(params)

        yield url
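The query-string step above, in isolation (Python 2 spelling, matching the example):

import urllib

print urllib.urlencode({'q': 'pipes', 'page': '2'})
#e.g. q=pipes&page=2 (dict key order may vary)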
Example #25
def pipe_csv(context, _INPUT, conf, **kwargs):
    """This source fetches and parses a csv file to yield items.
    
    Keyword arguments:
    context -- pipeline context       
    _INPUT -- not used
    conf:
        URL -- url
        skip -- number of header rows to skip
        col_mode -- column name source: row=header row(s), custom=defined in col_name
        col_name -- list of custom column names
        col_row_start -- first column header row
        col_row_end -- last column header row
        separator -- column separator
    
    Yields (_OUTPUT):
    file entries
    
    Note:
    Current restrictions:
      separator must be 1 character
      assumes every row has exactly the expected number of fields, as defined in the header
    """
    col_name = conf['col_name']
        
    for item in _INPUT:
        url = util.get_value(conf['URL'], item, **kwargs)
        separator = util.get_value(conf['separator'], item, **kwargs).encode('utf-8')
        skip = int(util.get_value(conf['skip'], item, **kwargs))
        col_mode = util.get_value(conf['col_mode'], item, **kwargs)
        col_row_start = int(util.get_value(conf['col_row_start'], item, **kwargs))
        col_row_end = int(util.get_value(conf['col_row_end'], item, **kwargs))
        
        f = urllib2.urlopen(url)
        
        if context.verbose:
            print "pipe_csv loading:", url
            
        for i in xrange(skip):
            f.next()
        
        reader = UnicodeReader(f, delimiter=separator)
            
        fieldnames = []
        if col_mode == 'custom':
            fieldnames = [util.get_value(x) for x in col_name]
        else:
            for row in xrange((col_row_end - col_row_start) +1):
                row = reader.next()
                fieldnames.extend(row)

        for row in reader:
            d = dict(zip(fieldnames, row))
            yield d
            
        if item == True: #i.e. this is being fed forever, i.e. not in a loop, so we just yield our item once
            break
Example #26
def pipe_urlbuilder(context, _INPUT, conf, **kwargs):
    """This source builds a url and yields it forever.
    
    Keyword arguments:
    context -- pipeline context
    _INPUT -- not used
    conf:
        BASE -- base
        PATH -- path elements
        PARAM -- query parameters
    
    Yields (_OUTPUT):
    url
    """

    for item in _INPUT:
        #note: we could cache get_value results if item==True
        url = util.get_value(conf['BASE'], item, **kwargs)
        if not url.endswith('/'):
            url += '/'

        if 'PATH' in conf: 
            path = conf['PATH']
            if not isinstance(path, list):
                path = [path]
            path = [util.get_value(p, item, **kwargs) for p in path if p]

            url += "/".join(str(p) for p in path if p)
        url = url.rstrip("/")

        #Ensure url is valid
        url = util.url_quote(url)

        param_defs = conf['PARAM']
        if not isinstance(param_defs, list):
            param_defs = [param_defs]

        params = dict([(util.get_value(p['key'], item, **kwargs), util.get_value(p['value'], item, **kwargs)) for p in param_defs if p])
        if params and params.keys() != [u'']:
            url += "?" + urllib.urlencode(params)

        yield url
Example #27
def pipe_simplemath(context, _INPUT, conf, **kwargs):
    """This operator performs basic arithmetic, such as addition and subtraction.

    Keyword arguments:
    context -- pipeline context
    _INPUT -- source generator
    kwargs -- other value, if wired in
    conf:
        OTHER -- input value
        OP -- operator
        
    Yields (_OUTPUT):
    result
    """

    value = float(util.get_value(conf['OTHER'], None, **kwargs))
    op = util.get_value(conf['OP'], None, **kwargs)

    for item in _INPUT:
        yield OPS[op](float(item), value)
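OPS is referenced but never defined in the example; a plausible table built on the operator module (the key names here are assumptions, not taken from the source):

import operator

OPS = {
    'add': operator.add,
    'subtract': operator.sub,
    'multiply': operator.mul,
    'divide': operator.truediv,
    'modulo': operator.mod,
}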
Example #28
def pipe_simplemath(context, _INPUT, conf, **kwargs):
    """This operator performs basic arithmetic, such as addition and subtraction.

    Keyword arguments:
    context -- pipeline context
    _INPUT -- source generator
    kwargs -- other value, if wired in
    conf:
        OTHER -- input value
        OP -- operator
        
    Yields (_OUTPUT):
    result
    """

    value = float(util.get_value(conf['OTHER'], None, **kwargs))
    op = util.get_value(conf['OP'], None, **kwargs)

    for item in _INPUT:
        yield OPS[op](float(item), value)
Example #29
def pipe_rename(context, _INPUT, conf, **kwargs):
    """This operator renames or copies fields in the input source. 

    Keyword arguments:
    context -- pipeline context
    _INPUT -- source generator
    kwargs -- other inputs, e.g. to feed terminals for rule values
    conf:
        RULE -- rules - each rule comprising (op, field, newval)
    
    Yields (_OUTPUT):
    source items after copying/renaming
    """
    rules = []
    
    rule_defs = conf['RULE']
    if not isinstance(rule_defs, list):
        rule_defs = [rule_defs]
       
    for rule in rule_defs:
        newval = util.get_value(rule['newval'], None, **kwargs) #todo use subkey?
        newfield = rule['field']
        #trick the get_value in the loop to mapping value onto an item key (rather than taking it literally, i.e. make it a LHS reference, not a RHS value)        
        newfield['subkey'] = newfield['value']
        del newfield['value']
        
        rules.append((rule['op']['value'], newfield, newval))
    
    for item in _INPUT:
        for rule in rules:
            try:
                value = util.get_value(rule[1], item, **kwargs) #forces an exception if any part is not found
                util.set_value(item, rule[2], value)
                if rule[0] == 'rename':
                    try:
                        util.del_value(item, rule[1]['subkey'])
                    except KeyError:
                        pass  #ignore if the target doesn't have our field (todo: issue a warning if debugging?)
            except AttributeError:
                pass  #ignore if the source doesn't have our field (todo: issue a warning if debugging?)
        yield item
Example #30
def pipe_substr(context, _INPUT, conf, **kwargs):
    """Returns a substring.
    
    Keyword arguments:
    context -- pipeline context
    _INPUT -- source generator
    conf:
        from -- starting character
        length -- number of characters to return
    
    Yields (_OUTPUT):
    portion of source string
    """
    sfrom = int(util.get_value(conf['from'], None, **kwargs))
    length = int(util.get_value(conf['length'], None, **kwargs))

    for item in _INPUT:
        yield item[sfrom:sfrom+length]

        if item == True: #i.e. this is being fed forever, i.e. not in a loop, so we just yield our item once
            break        
Example #31
def pipe_substr(context, _INPUT, conf, **kwargs):
    """Returns a substring.
    
    Keyword arguments:
    context -- pipeline context
    _INPUT -- source generator
    conf:
        from -- starting character
        length -- number of characters to return
    
    Yields (_OUTPUT):
    portion of source string
    """
    sfrom = int(util.get_value(conf['from'], None, **kwargs))
    length = int(util.get_value(conf['length'], None, **kwargs))

    for item in _INPUT:
        yield item[sfrom:sfrom + length]

        if item == True:  #i.e. this is being fed forever, i.e. not in a loop, so we just yield our item once
            break
Example #32
def pipe_strregex(context, _INPUT, conf, **kwargs):
    """This operator replaces values using regexes. 

    Keyword arguments:
    context -- pipeline context
    _INPUT -- source generator
    kwargs -- other inputs, e.g. to feed terminals for rule values
    conf:
        RULE -- rules - each rule comprising (match, replace)
    
    Yields (_OUTPUT):
    source item after replacing values matching regexes
    """
    rules = []

    rule_defs = conf['RULE']
    if not isinstance(rule_defs, list):
        rule_defs = [rule_defs]

    for rule in rule_defs:
        #TODO compile regex here: c = re.compile(match)
        match = util.get_value(rule['match'], None,
                               **kwargs)  #todo use subkey?
        replace = util.get_value(rule['replace'], None,
                                 **kwargs)  #todo use subkey?

        if replace is None:
            replace = ''

        #convert regex to Python format: todo use a common routine for this
        replace = re.sub(r'\$(\d+)', r'\\\1', replace)  #map $1 to \1 etc.   #todo: also need to escape any existing \1 etc.

        rules.append((match, replace))

    for item in _INPUT:
        for rule in rules:
            item = re.sub(rule[0], rule[1], item)

        yield item
Example #33
def pipe_strreplace(context, _INPUT, conf, **kwargs):
    """Replaces text with replacement text.
    
    Keyword arguments:
    context -- pipeline context
    _INPUT -- source generator
    conf:
        RULE -- rules - each rule comprising (find, param, replace):
            find -- text to find
            param -- type of match: 1=first, 2=last, 3=every
            replace -- text to replace with
    
    Yields (_OUTPUT):
    source string with replacements
    """
    rules = []

    rule_defs = conf['RULE']
    if not isinstance(rule_defs, list):
        rule_defs = [rule_defs]

    for rule in rule_defs:
        find = util.get_value(rule['find'], None, **kwargs)
        param = util.get_value(rule['param'], None, **kwargs)
        replace = util.get_value(rule['replace'], None, **kwargs)
        rules.append((find, param, replace))

    for item in _INPUT:
        t = item
        for rule in rules:
            if rule[1] == '1':
                t = t.replace(rule[0], rule[2], 1)
            elif rule[1] == '2':
                t = util.rreplace(t, rule[0], rule[2], 1)
            elif rule[1] == '3':
                t = t.replace(rule[0], rule[2])
            #todo else assertion

        yield t
Example #34
def pipe_fetch(context, _INPUT, conf, **kwargs):
    """This source fetches and parses one or more feeds to yield the feed entries.
    
    Keyword arguments:
    context -- pipeline context       
    _INPUT -- not used
    conf:
        URL -- url
    
    Yields (_OUTPUT):
    feed entries
    """
    urls = conf['URL']
    if not isinstance(urls, list):
        urls = [urls]

    for item in _INPUT:
        for item_url in urls:
            url = util.get_value(item_url, item, **kwargs)

            if '://' not in url:
                url = 'http://' + url

            if context.verbose:
                print "pipe_fetch loading:", url
            d = feedparser.parse(url.encode('utf-8'))

            for entry in d['entries']:
                if 'updated_parsed' in entry:
                    entry['pubDate'] = entry[
                        'updated_parsed']  #map from universal feedparser's normalised names
                    entry['y:published'] = entry[
                        'updated_parsed']  #yahoo's own version
                if 'author' in entry:
                    entry['dc:creator'] = entry['author']
                if 'author_detail' in entry:
                    if 'href' in entry['author_detail']:
                        entry['author.uri'] = entry['author_detail']['href']
                    if 'name' in entry['author_detail']:
                        entry['author.name'] = entry['author_detail']['name']
                #todo more!?
                if 'title' in entry:
                    entry['y:title'] = entry['title']  #yahoo's own versions
                if 'id' in entry:
                    entry['y:id'] = entry['id']  #yahoo's own versions
                #todo more!?
                yield entry

        if item == True:  #i.e. this is being fed forever, i.e. not in a loop, so we just yield our item once
            break
Example #35
def pipe_fetch(context, _INPUT, conf, **kwargs):
    """This source fetches and parses one or more feeds to yield the feed entries.
    
    Keyword arguments:
    context -- pipeline context       
    _INPUT -- not used
    conf:
        URL -- url
    
    Yields (_OUTPUT):
    feed entries
    """
    urls = conf['URL']
    if not isinstance(urls, list):
        urls = [urls]
    
    for item in _INPUT:
        for item_url in urls:
            url = util.get_value(item_url, item, **kwargs)
            
            if '://' not in url:
                url = 'http://' + url
            
            if context.verbose:
                print "pipe_fetch loading:", url
            d = feedparser.parse(url.encode('utf-8'))
            
            for entry in d['entries']:
                if 'updated_parsed' in entry:
                    entry['pubDate'] = entry['updated_parsed']  #map from universal feedparser's normalised names
                    entry['y:published'] = entry['updated_parsed']  #yahoo's own version
                if 'author' in entry:
                    entry['dc:creator'] = entry['author']
                if 'author_detail' in entry:
                    if 'href' in entry['author_detail']:
                        entry['author.uri'] = entry['author_detail']['href']
                    if 'name' in entry['author_detail']:
                        entry['author.name'] = entry['author_detail']['name']
                #todo more!?
                if 'title' in entry:
                    entry['y:title'] = entry['title']  #yahoo's own versions
                if 'id' in entry:
                    entry['y:id'] = entry['id']  #yahoo's own versions
                #todo more!?
                yield entry

        if item == True: #i.e. this is being fed forever, i.e. not in a loop, so we just yield our item once
            break
Example #36
def pipe_datebuilder(context, _INPUT, conf, **kwargs):
    """This source builds a date and yields it forever.
    
    Keyword arguments:
    context -- pipeline context
    _INPUT -- source generator
    conf:
        DATE -- date
    
    Yields (_OUTPUT):
    date
    """
    for item in _INPUT:
        date = util.get_value(conf['DATE'], item, **kwargs)
        try:
            date = float(date)
        except ValueError:
            pass
        if isinstance(date, (float, int)):
            date = datetime.utcfromtimestamp(date)
        else:
            date = str(date).lower()
            if date.endswith(' day') or date.endswith(' days'):
                count = int(date.split(' ')[0])
                date = (datetime.utcnow() + timedelta(days=count))
            elif date.endswith(' year') or date.endswith(' years'):
                count = int(date.split(' ')[0])
                date = datetime.utcnow()
                date = date.replace(year = date.year + count)
            elif date == 'today':
                date = datetime.utcnow()
            elif date == 'tomorrow':
                date = (datetime.utcnow() + timedelta(days=1))
            elif date == 'yesterday':
                date = (datetime.utcnow() + timedelta(days=-1))
            elif date == 'now':  #todo is this allowed by Yahoo?
                date = datetime.utcnow()
            else:
                for df in util.ALTERNATIVE_DATE_FORMATS:
                    try:
                        date = datetime.strptime(date, df)
                        break
                    except ValueError:
                        pass
                else:
                    #todo: raise an exception: unexpected date format
                    pass
            
        yield date
Example #37
def pipe_yql(context=None, _INPUT=None, conf=None, **kwargs):
    """This source issues YQL queries.

    Keyword arguments:
    context -- pipeline context
    _INPUT -- not used
    conf:
        yqlquery -- YQL query
        # todo: handle envURL

    Yields (_OUTPUT):
    query results
    """
    # todo: get from a config/env file
    url = "http://query.yahooapis.com/v1/public/yql"
    conf = DotDict(conf)
    query = conf['yqlquery']

    for item in _INPUT:
        item = DotDict(item)
        yql = util.get_value(query, item, **kwargs)

        # note: we use the default format of xml since json loses some
        # structure
        # todo: diagnostics=true e.g. if context.test
        # todo: consider paging for large result sets
        r = requests.get(url, params={'q': yql}, stream=True)

        # Parse the response
        tree = parse(r.raw)

        if context and context.verbose:
            print "pipe_yql loading xml:", yql

        root = tree.getroot()

        # note: query also has row count
        results = root.find('results')

        # Convert xml into generation of dicts
        for element in results.getchildren():
            yield util.etree_to_dict(element)

        if item.get('forever'):
            # _INPUT is pipeforever and not a loop,
            # so we just yield our item once
            break
Example #38
def pipe_truncate(context, _INPUT, conf, **kwargs):
    """This operator truncates the number of items in a feed.

    Keyword arguments:
    context -- pipeline context
    _INPUT -- source generator
    kwargs -- terminal, if the truncation value is wired in
    conf:
        count -- length of the truncated feed, if specified literally
        
    Yields (_OUTPUT):
    truncated list of source items
    """

    count = conf['count']
    limit = int(util.get_value(count, None, **kwargs))
    for i in xrange(0, limit):
        yield _INPUT.next()
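itertools.islice expresses the same truncation without manual next() calls, and stops cleanly if the input runs out before the limit is reached:

from itertools import islice

def truncate(_INPUT, limit):
    #a sketch of the same behaviour, not pipe2py's actual implementation
    for item in islice(_INPUT, limit):
        yield item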
Example #39
def pipe_truncate(context, _INPUT, conf, **kwargs):
    """This operator truncates the number of items in a feed.

    Keyword arguments:
    context -- pipeline context
    _INPUT -- source generator
    kwargs -- terminal, if the truncation value is wired in
    conf:
        count -- length of the truncated feed, if specified literally
        
    Yields (_OUTPUT):
    truncated list of source items
    """

    count = conf['count']
    limit = int(util.get_value(count, None, **kwargs))
    for i in xrange(0, limit):
        yield _INPUT.next()
Example #40
def pipe_datebuilder(context, _INPUT, conf, **kwargs):
    """This source builds a date and yields it forever.
    
    Keyword arguments:
    context -- pipeline context
    _INPUT -- source generator
    conf:
        DATE -- date
    
    Yields (_OUTPUT):
    date
    """
    for item in _INPUT:
        date = util.get_value(conf['DATE'], item, **kwargs).lower()

        if date.endswith(' day') or date.endswith(' days'):
            count = int(date.split(' ')[0])
            date = (datetime.today() + timedelta(days=count)).timetuple()
        elif date.endswith(' year') or date.endswith(' years'):
            count = int(date.split(' ')[0])
            date = datetime.today().replace(year=datetime.today().year +
                                            count).timetuple()
        elif date == 'today':
            date = datetime.today().timetuple()
        elif date == 'tomorrow':
            date = (datetime.today() + timedelta(days=1)).timetuple()
        elif date == 'yesterday':
            date = (datetime.today() + timedelta(days=-1)).timetuple()
        elif date == 'now':  #todo is this allowed by Yahoo?
            date = datetime.now().timetuple()  #better to use utcnow?
        else:
            for df in util.ALTERNATIVE_DATE_FORMATS:
                try:
                    date = datetime.strptime(date, df).timetuple()
                    break
                except ValueError:
                    pass
            else:
                #todo: raise an exception: unexpected date format
                pass

        yield date
Example #41
def pipe_stringtokenizer(context, _INPUT, conf, **kwargs):
    """Splits a string into tokens delimited by separators.
    
    Keyword arguments:
    context -- pipeline context
    _INPUT -- source generator
    conf:
        to-str -- separator string
    
    Yields (_OUTPUT):
    tokens of the input string
    """
    delim = util.get_value(conf['to-str'], None, **kwargs)

    for item in _INPUT:
        if item is not None:
            for token in item.split(delim):
                yield token

        if item == True: #i.e. this is being fed forever, i.e. not in a loop, so we just yield our item once
            break        
Example #42
def pipe_datebuilder(context, _INPUT, conf, **kwargs):
    """This source builds a date and yields it forever.
    
    Keyword arguments:
    context -- pipeline context
    _INPUT -- source generator
    conf:
        DATE -- date
    
    Yields (_OUTPUT):
    date
    """
    for item in _INPUT:
        date = util.get_value(conf["DATE"], item, **kwargs).lower()

        if date.endswith(" day") or date.endswith(" days"):
            count = int(date.split(" ")[0])
            date = (datetime.today() + timedelta(days=count)).timetuple()
        elif date.endswith(" year") or date.endswith(" years"):
            count = int(date.split(" ")[0])
            date = datetime.today().replace(year=datetime.today().year + count).timetuple()
        elif date == "today":
            date = datetime.today().timetuple()
        elif date == "tomorrow":
            date = (datetime.today() + timedelta(days=1)).timetuple()
        elif date == "yesterday":
            date = (datetime.today() + timedelta(days=-1)).timetuple()
        elif date == "now":  # todo is this allowed by Yahoo?
            date = datetime.now().timetuple()  # better to use utcnow?
        else:
            for df in util.ALTERNATIVE_DATE_FORMATS:
                try:
                    date = datetime.strptime(date, df).timetuple()
                    break
                except ValueError:
                    pass
            else:
                # todo: raise an exception: unexpected date format
                pass

        yield date
Example #43
def pipe_yql(context, _INPUT, conf, **kwargs):
    """This source issues YQL queries.
    
    Keyword arguments:
    context -- pipeline context
    _INPUT -- not used
    conf:
        yqlquery -- YQL query
        #todo handle envURL
    
    Yields (_OUTPUT):
    query results
    """
    url = "http://query.yahooapis.com/v1/public/yql"  #todo get from a config/env file

    for item in _INPUT:
        yql = util.get_value(conf['yqlquery'], item, **kwargs)

        query = urllib.urlencode({
            'q': yql,
            #note: we use the default format of xml since json loses some structure
            #todo diagnostics=true e.g. if context.test
            #todo consider paging for large result sets
        })
        req = urllib2.Request(url, query)
        response = urllib2.urlopen(req)

        #Parse the response
        ft = ElementTree.parse(response)
        if context.verbose:
            print "pipe_yql loading xml:", yql
        root = ft.getroot()
        #note: query also has row count
        results = root.find('results')
        #Convert xml into generation of dicts
        for element in results.getchildren():
            i = util.xml_to_dict(element)
            yield i

        if item == True:  #i.e. this is being fed forever, i.e. not in a loop, so we just yield our item once
            break
Example #44
def pipe_stringtokenizer(context, _INPUT, conf, **kwargs):
    """Splits a string into tokens delimited by separators.
    
    Keyword arguments:
    context -- pipeline context
    _INPUT -- source generator
    conf:
        to-str -- separator string
    
    Yields (_OUTPUT):
    tokens of the input string
    """
    delim = util.get_value(conf['to-str'], None, **kwargs)

    for item in _INPUT:
        if item is not None:
            for chunk in item.split(delim):
                yield {'content':chunk}

        if item == True: #i.e. this is being fed forever, i.e. not in a loop, so we just yield our item once
            break        
Example #45
def pipe_yql(context, _INPUT, conf,  **kwargs):
    """This source issues YQL queries.
    
    Keyword arguments:
    context -- pipeline context
    _INPUT -- not used
    conf:
        yqlquery -- YQL query
        #todo handle envURL
    
    Yields (_OUTPUT):
    query results
    """
    url = "http://query.yahooapis.com/v1/public/yql" #todo get from a config/env file
    
    for item in _INPUT:
        yql = util.get_value(conf['yqlquery'], item, **kwargs)
        
        query = urllib.urlencode({'q':yql,
                                  #note: we use the default format of xml since json loses some structure
                                  #todo diagnostics=true e.g. if context.test
                                  #todo consider paging for large result sets
                                 })
        req = urllib2.Request(url, query)    
        response = urllib2.urlopen(req)    
        
        #Parse the response
        ft = ElementTree.parse(response)
        if context.verbose:
            print "pipe_yql loading xml:", yql
        root = ft.getroot()
        #note: query also has row count
        results = root.find('results')
        #Convert xml into generation of dicts
        for element in results.getchildren():
            i = util.xml_to_dict(element)
            yield i
    
        if item == True: #i.e. this is being fed forever, i.e. not in a loop, so we just yield our item once
            break
Example #46
def pipe_truncate(context, _INPUT, conf, **kwargs):
    """This operator truncates the number of items in a feed.

    Keyword arguments:
    context -- pipeline context
    _INPUT -- source generator
    kwargs -- terminal, if the truncation value is wired in
    conf:
        count -- length of the truncated feed, if specified literally
        
    Yields (_OUTPUT):
    truncated list of source items
    """

    count = conf["count"]
    limit = int(util.get_value(count, None, **kwargs))
    i = 0
    for item in _INPUT:
        if i >= limit:
            break
        yield item
        i += 1
Example #47
def pipe_filter(context, _INPUT, conf, **kwargs):
    """This operator filters the input source, including or excluding fields, that match a set of defined rules. 

    Keyword arguments:
    context -- pipeline context        
    _INPUT -- source generator
    kwargs -- other inputs, e.g. to feed terminals for rule values
    conf:
        MODE -- filter mode, either "permit" or "block"
        COMBINE -- filter boolean combination, either "and" or "or"
        RULE -- rules - each rule comprising (field, op, value)
    
    Yields (_OUTPUT):
    source items that match the rules
    """
    mode = conf['MODE']['value']
    combine = conf['COMBINE']['value']
    rules = []

    rule_defs = conf['RULE']
    if not isinstance(rule_defs, list):
        rule_defs = [rule_defs]

    for rule in rule_defs:
        field = rule['field']['value']
        value = util.get_value(rule['value'], None,
                               **kwargs)  #todo use subkey?
        rules.append((field, rule['op']['value'], value))

    for item in _INPUT:
        if combine in COMBINE_BOOLEAN:
            res = COMBINE_BOOLEAN[combine](_rulepass(rule, item)
                                           for rule in rules)
        else:
            raise Exception("Invalid combine %s (expecting and or or)" %
                            combine)

        if (res and mode == "permit") or (not res and mode == "block"):
            yield item
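COMBINE_BOOLEAN is referenced but not defined in these examples; since it is called with a generator of per-rule booleans, the natural table (an assumption here) is the built-ins:

COMBINE_BOOLEAN = {'and': all, 'or': any}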
Example #48
def _convert_item(rules, item, **kwargs):
    for rule in rules:
        value = util.get_value(rule[1], item, **kwargs)

        try:
            # forces an exception if any part is not found
            item.set(rule[2], value)
        except AttributeError:
            # ignore if the source doesn't have our field
            # todo: issue a warning if debugging?
            pass

        if rule[0] == 'rename':
            try:
                item.delete(rule[1]['subkey'])
            # TypeError catches pseudo subkeys, e.g. summary.content
            except (KeyError, TypeError):
                # ignore if the target doesn't have our field
                # todo: issue a warning if debugging?
                pass

    return item
Example #49
def pipe_filter(context, _INPUT, conf, **kwargs):
    """This operator filters the input source, including or excluding fields, that match a set of defined rules. 

    Keyword arguments:
    context -- pipeline context        
    _INPUT -- source generator
    kwargs -- other inputs, e.g. to feed terminals for rule values
    conf:
        MODE -- filter mode, either "permit" or "block"
        COMBINE -- filter boolean combination, either "and" or "or"
        RULE -- rules - each rule comprising (field, op, value)
    
    Yields (_OUTPUT):
    source items that match the rules
    """
    mode = conf['MODE']['value']
    combine = conf['COMBINE']['value']
    rules = []

    rule_defs = conf['RULE']
    if not isinstance(rule_defs, list):
        rule_defs = [rule_defs]
    
    for rule in rule_defs:
        field = rule['field']['value']
        value = util.get_value(rule['value'], None, **kwargs) #todo use subkey?
        rules.append((field, rule['op']['value'], value))

    for item in _INPUT:
        if item == True:
            break
        if combine in COMBINE_BOOLEAN: 
            res = COMBINE_BOOLEAN[combine](_rulepass(rule, item) for rule in rules)
        else:
            raise Exception("Invalid combine %s (expecting 'and' or 'or')" % combine)

        if (res and mode == "permit") or (not res and mode == "block"):
            yield item
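# COMBINE_BOOLEAN and _rulepass are defined elsewhere in the module; a
# plausible minimal form (an assumption, not the verbatim source) is:
COMBINE_BOOLEAN = {'and': all, 'or': any}

def _rulepass(rule, item):
    # rule is the (field, op, value) tuple built above; item is assumed
    # dict-like, and only the 'contains' op is sketched here
    field, op, value = rule
    if op == 'contains':
        return value in unicode(item.get(field, u''))
    raise NotImplementedError(op)

rules = [('title', 'contains', u'python')]
item = {'title': u'python rocks'}
assert COMBINE_BOOLEAN['and'](_rulepass(r, item) for r in rules)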
Example #50
0
def pipe_fetchdata(context, _INPUT, conf, **kwargs):
    """This source fetches and parses any XML or JSON file (todo iCal or KML) to yield a list of elements.
    
    Keyword arguments:
    context -- pipeline context
    _INPUT -- source generator; items may feed wired values into URL and path
    conf:
        URL -- url
        path -- path to list
    
    Yields (_OUTPUT):
    elements
    """
    urls = conf['URL']
    if not isinstance(urls, list):
        urls = [urls]

    for item in _INPUT:
        for item_url in urls:
            url = util.get_value(item_url, item, **kwargs)

            if '://' not in url:
                url = 'http://' + url
            path = util.get_value(conf['path'], item, **kwargs)
            match = None

            #Parse the file into a dictionary
            try:
                f = urllib2.urlopen(url)
                ft = ElementTree.parse(f)
                if context.verbose:
                    print "pipe_fetchdata loading xml:", url
                root = ft.getroot()
                #Move to the point referenced by the path
                #todo lxml would simplify and speed up this
                if path:
                    if root.tag[0] == '{':
                        namespace = root.tag[1:].split("}")[0]
                        for i in path.split(".")[:-1]:
                            root = root.find("{%s}%s" % (namespace, i))
                            if root is None:
                                return
                        match = "{%s}%s" % (namespace, path.split(".")[-1])
                    else:
                        match = "%s" % (path.split(".")[-1])
                #Convert xml into generation of dicts
                if match:
                    for element in root.findall(match):
                        i = util.etree_to_pipes(element)
                        yield i
                else:
                    i = util.etree_to_pipes(root)
                    yield i

            except Exception, e:
                try:
                    f = urllib2.urlopen(url)
                    d = json.load(f)
                    #todo test:-
                    if context.verbose:
                        print "pipe_fetchdata loading json:", url
                    if path:
                        for i in path.split(".")[:-1]:
                            d = d.get(i)
                        match = path.split(".")[-1]
                    if match:
                        for itemd in d:
                            if itemd == match:
                                if isinstance(d[itemd], list):
                                    for nested_item in d[itemd]:
                                        yield nested_item
                                else:
                                    yield [d[itemd]]
                    else:
                        yield d
                except Exception, e:
                    #todo try iCal and yield
                    #todo try KML and yield
                    if context.verbose:
                        print "xml and json both failed:"

                    raise
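# A worked sketch of the dotted-path walk used in the JSON branch above
# (data and path are illustrative):
d = {'rsp': {'photos': {'photo': [{'id': 1}, {'id': 2}]}}}
path = 'rsp.photos.photo'
for part in path.split('.')[:-1]:
    d = d.get(part)
match = path.split('.')[-1]
assert isinstance(d[match], list) and len(d[match]) == 2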
Example #51
0
def pipe_loop(context, _INPUT, conf, embed=None, **kwargs):
    """This operator loops over the input performing the embedded submodule. 

    Keyword arguments:
    context -- pipeline context
    _INPUT -- source generator
    kwargs -- other inputs, e.g. to feed terminals for rule values
    conf:
        mode -- how to affect output - either assign or EMIT
        assign_to -- if mode is assign, which field to assign to (new or existing)
        assign_part -- 'first' to keep only the first submodule result, otherwise collect them all
        emit_part -- counterpart of assign_part for EMIT mode
        loop_with -- pass a particular field into the submodule rather than the whole item
    embed -- embedded submodule
    
    Yields (_OUTPUT):
    source items after passing through the submodule and adding/replacing values
    """
    mode = conf['mode']['value']
    assign_to = conf['assign_to']['value']
    assign_part = conf['assign_part']['value']
    emit_part = conf['emit_part']['value']
    loop_with = conf['with']['value']
    embed_conf = conf['embed']['value']['conf']

    #Prepare the submodule to take parameters from the loop instead of from the user
    embed_context = copy.copy(context)
    embed_context.submodule = True

    for item in _INPUT:
        if loop_with:
            inp = item[loop_with]  #todo: get_value here?
        else:
            inp = item

        #Pass any input parameters into the submodule
        embed_context.inputs = {}
        for k in embed_conf:
            embed_context.inputs[k] = unicode(
                util.get_value(embed_conf[k], item))
        p = embed(embed_context, [inp], embed_conf)  #prepare the submodule

        results = None
        try:
            #loop over the submodule, emitting as we go or collecting results for later assignment
            for i in p:
                if assign_part == 'first':
                    if mode == 'EMIT':
                        yield i
                    else:
                        results = i
                    break
                else:  #all
                    if mode == 'EMIT':
                        yield i
                    else:
                        if results:
                            results.append(i)
                        else:
                            results = [i]
        except HTTPError:  #todo any other errors we want to continue looping after?
            if context.verbose:
                print "Submodule gave HTTPError - continuing the loop"
            continue

        if mode == 'assign':
            if results and len(results) == 1:
                results = results[0]
            util.set_value(item, assign_to, results)
            yield item
        elif mode == 'EMIT':
            pass  #already yielded
        else:
            raise Exception("Invalid mode %s (expecting assign or EMIT)" %
                            mode)
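# A hedged sketch of the two output modes above: 'EMIT' streams every
# submodule result as its own item, while 'assign' writes the collected
# result(s) back onto the looped item under assign_to. Illustrative only;
# assign_to is assumed to be 'loop:result', and util.set_value is assumed
# to reduce to a plain assignment for a flat key:
item = {'title': 'a'}
results = ['A']  # what the submodule produced for this item
if len(results) == 1:
    results = results[0]
item['loop:result'] = results
assert item == {'title': 'a', 'loop:result': 'A'}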
Example #52
0
def sub_fields(matchobj):
    return util.get_value({'subkey': matchobj.group(1)}, item)
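# sub_fields reads like a callback for re.sub, resolving a captured field
# name against the current item ('item' is a free variable bound in the
# enclosing scope). A self-contained analogue, where the ${...} pattern and
# the helper are assumptions rather than the verbatim source:
import re
item = {'title': u'hello'}
def _sub(matchobj):
    return unicode(item.get(matchobj.group(1), u''))
assert re.sub(r'\$\{(.+?)\}', _sub, u'Title: ${title}') == u'Title: hello'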
Example #53
0
def pipe_fetchpage(context, _INPUT, conf, **kwargs):
    """Fetch Page module

    _INPUT -- not used for data since this module has no inputs; it only drives the fetch loop.

    conf:
       URL -- url object containing the URL to download
       from -- string from where to start the input
       to -- string to limit the input
       token -- if present, split the input on this token to generate items

       Description: http://pipes.yahoo.com/pipes/docs?doc=sources#FetchPage

       TODOS:
        - don't retrieve pages larger than 200k
        - don't retrieve if page is not indexable.
        - item delimiter removes the closing tag if using an HTML tag
          (not documented but happens)
        - items should be cleaned, i.e. stripped of HTML tags
    """
    urls = conf['URL']
    if not isinstance(urls, list):
        urls = [urls]

    for item in _INPUT:
        for item_url in urls:
            url = util.get_value(item_url, item, **kwargs)
            if context.verbose:
                print "FetchPage: Preparing to download:", url

            try:
                request = urllib2.Request(url)
                request.add_header('User-Agent', 'Yahoo Pipes 1.0')
                request = urllib2.build_opener().open(request)
                content = unicode(
                    request.read(),
                    request.headers['content-type'].split('charset=')[-1])

                # TODO it seems that Yahoo! converts relative links to absolute;
                # TODO this needs to be done on the content but seems to be a
                # TODO non-trivial task in Python

                if context.verbose:
                    print "............FetchPage: content ................."
                    print content.encode("utf-8")
                    print "............FetchPage: EOF     ................."

                from_delimiter = util.get_value(conf["from"], _INPUT, **kwargs)
                to_delimiter = util.get_value(conf["to"], _INPUT, **kwargs)
                split_token = util.get_value(conf["token"], _INPUT, **kwargs)

                # determine from location, i.e. from where to start reading content
                from_location = 0
                if from_delimiter != "":
                    from_location = content.find(from_delimiter)
                    # Yahoo! does not strip off the from_delimiter.
                    #if from_location > 0:
                    #    from_location += len(from_delimiter)

                # determine to location, i.e. where to stop reading content
                to_location = 0
                if to_delimiter != "":
                    to_location = content.find(to_delimiter, from_location)

                # reduce the content depending on the to/from locations
                if from_location > 0 and to_location > 0:
                    content = content[from_location:to_location]
                elif from_location > 0:
                    content = content[from_location:]
                elif to_location > 0:
                    content = content[:to_location]

                # determine items depending on the split_token
                res_items = []
                if split_token != "":
                    res_items = content.split(split_token)
                else:
                    res_items = [content]

                if context.verbose:
                    print "FetchPage: found count items:", len(res_items)

                for res_item in res_items:
                    if context.verbose:
                        print "--------------item data --------------------"
                        print res_item
                        print "--------------EOF item data ----------------"
                    yield {"content": res_item}

            except Exception, e:
                if context.verbose:
                    print "FetchPage: failed to retrieve from:", url

                    print "----------------- FetchPage -----------------"
                    import traceback
                    traceback.print_exc()
                    print "----------------- FetchPage -----------------"
                raise

        if item == True:  #i.e. this is being fed forever, i.e. not in a loop, so we just yield our item once
            break
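# A worked sketch of the from/to/token handling above; note that, like
# Yahoo! Pipes, the from-delimiter itself is kept (content illustrative):
content = 'xx<body>A,B,C</body>yy'
from_location = content.find('<body>')
to_location = content.find('</body>', from_location)
content = content[from_location:to_location]
assert content.split(',') == ['<body>A', 'B', 'C']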
Example #54
0
def pipe_loop(context, _INPUT, conf, embed=None, **kwargs):
    """This operator loops over the input performing the embedded submodule. 

    Keyword arguments:
    context -- pipeline context
    _INPUT -- source generator
    kwargs -- other inputs, e.g. to feed terminals for rule values
    conf:
        mode -- how to affect output - either assign or EMIT
        assign_to -- if mode is assign, which field to assign to (new or existing)
        assign_part -- 'first' to keep only the first submodule result, otherwise collect them all
        emit_part -- counterpart of assign_part for EMIT mode
        loop_with -- pass a particular field into the submodule rather than the whole item
    embed -- embedded submodule
    
    Yields (_OUTPUT):
    source items after passing through the submodule and adding/replacing values
    """
    mode = conf['mode']['value']
    assign_to = conf['assign_to']['value']
    assign_part = conf['assign_part']['value']
    emit_part = conf['emit_part']['value']
    loop_with = conf['with']['value']
    embed_conf = conf['embed']['value']['conf']

    #Prepare the submodule to take parameters from the loop instead of from the user
    embed_context = copy.copy(context)
    embed_context.submodule = True

    for item in _INPUT:
        if loop_with:
            inp = util.get_subkey(loop_with, item)
        else:
            inp = item

        #Pass any input parameters into the submodule
        embed_context.inputs = {}
        for k in embed_conf:
            embed_context.inputs[k] = unicode(
                util.get_value(embed_conf[k], item))
        p = embed(embed_context, [inp], embed_conf)  #prepare the submodule

        results = None
        try:
            #loop over the submodule, emitting as we go or collecting results for later assignment
            for i in p:
                if assign_part == 'first':
                    if mode == 'EMIT':
                        yield i
                    else:
                        results = i
                    break
                else:  #all
                    if mode == 'EMIT':
                        yield i
                    else:
                        if results:
                            results.append(i)
                        else:
                            results = [i]
            if results and mode == 'assign':
                #this is a hack to make sure fetchpage works in and out of a loop while not disturbing strconcat in a loop etc.
                #(goes with the comment below about checking the delivery capability of the source)
                if len(results) == 1 and isinstance(results[0], dict):
                    results = [results]
        except HTTPError:  #todo any other errors we want to continue looping after?
            if context.verbose:
                print "Submodule gave HTTPError - continuing the loop"
            continue

        if mode == 'assign':
            if results and len(
                    results
            ) == 1:  #note: i suspect this needs to be more discerning and only happen if the source can only ever deliver 1 result, e.g. strconcat vs. fetchpage
                results = results[0]
            util.set_value(item, assign_to, results)
            yield item
        elif mode == 'EMIT':
            pass  #already yielded
        else:
            raise Exception("Invalid mode %s (expecting assign or EMIT)" %
                            mode)
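# This variant differs from the earlier pipe_loop chiefly in resolving
# loop_with via util.get_subkey (dotted paths) instead of a plain item
# lookup. A minimal analogue of such a helper, as an assumption about its
# behaviour (nested dicts only):
def get_subkey(subkey, item):
    # walk each dotted path segment down through nested dicts
    for part in subkey.split('.'):
        item = item[part]
    return item
assert get_subkey('a.b', {'a': {'b': 1}}) == 1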
Example #55
0
def pipe_xpathfetchpage(context, _INPUT, conf, **kwargs):
    """XPath Fetch Page module

    _INPUT -- not used for data since this module has no inputs; it only drives the fetch loop.

    conf:
       URL -- url object containing the URL to download
       xpath -- xpath to extract
       html5 -- use html5 parser?
       useAsString -- emit items as string?

       Description: http://pipes.yahoo.com/pipes/docs?doc=sources#XPathFetchPage

       TODOS:
        - don't retrieve pages larger than 1.5MB
        - don't retrieve if page is not indexable.
    """
    urls = conf['URL']
    if not isinstance(urls, list):
        urls = [urls]

    for item in _INPUT:
        for item_url in urls:
            url = util.get_value(item_url, item, **kwargs)
            if context.verbose:
                print "XPathFetchPage: Preparing to download:", url

            try:
                request = urllib2.Request(url)
                request.add_header('User-Agent', 'Yahoo Pipes 1.0')
                request = urllib2.build_opener().open(request)
                content = unicode(
                    request.read(),
                    request.headers['content-type'].split('charset=')[-1])

                # TODO it seems that Yahoo! converts relative links to absolute;
                # TODO this needs to be done on the content but seems to be a
                # TODO non-trivial task in Python

                xpath = util.get_value(conf["xpath"], _INPUT, **kwargs)
                html5 = False
                useAsString = False
                if "html5" in conf:
                    html5 = util.get_value(conf["html5"], _INPUT,
                                           **kwargs) == "true"
                if "useAsString" in conf:
                    useAsString = util.get_value(conf["useAsString"], _INPUT,
                                                 **kwargs) == "true"

                if html5:
                    #from lxml.html import html5parser
                    #root = html5parser.fromstring(content)
                    from html5lib import parse
                    root = parse(content,
                                 treebuilder='lxml',
                                 namespaceHTMLElements=False)
                else:
                    from lxml import etree
                    root = etree.HTML(content)
                res_items = root.xpath(xpath)

                if context.verbose:
                    print "XPathFetchPage: found count items:", len(res_items)

                for res_item in res_items:
                    i = util.etree_to_pipes(
                        res_item)  #TODO xml_to_dict(res_item)
                    if context.verbose:
                        print "--------------item data --------------------"
                        print i
                        print "--------------EOF item data ----------------"
                    if useAsString:
                        yield {"content": unicode(i)}
                    else:
                        yield i

            except Exception, e:
                if context.verbose:
                    print "XPathFetchPage: failed to retrieve from:", url

                    print "----------------- XPathFetchPage -----------------"
                    import traceback
                    traceback.print_exc()
                    print "----------------- XPathFetchPage -----------------"
                raise

        if item == True:  #i.e. this is being fed forever, i.e. not in a loop, so we just yield our item once
            break
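# A minimal, self-contained version of the parse-and-extract step above,
# using only the lxml branch (html5lib parsing omitted):
from lxml import etree
root = etree.HTML('<html><body><p>a</p><p>b</p></body></html>')
assert [p.text for p in root.xpath('//p')] == ['a', 'b']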