def farith(*args):
    """
    .. function:: farith(calc) -> float or Fraction

    Takes as input a mathematical expression in Polish notation and computes
    the result using fractional computation.

    Examples:

    >>> sql("select farith('+',5,7)")
    farith('+',5,7)
    ---------------
    12

    >>> sql("select farith('-','*','/',15,'-',7,'+',1,1,3,'+',2,'+',1,1)")
    farith('-','*','/',15,'-',7,'+',1,1,3,'+',2,'+',1,1)
    ----------------------------------------------------
    5
    """
    s = []
    for i in reversed(args):
        if i in ('*', '/', '-', '+'):
            operand1 = s.pop()
            operand2 = s.pop()
            if i == '+':
                operand = operand1 + operand2
            elif i == '-':
                operand = operand1 - operand2
            elif i == '/':
                operand = operand1 / operand2
            elif i == '*':
                operand = operand1 * operand2
            s.append(operand)
        else:
            if type(i) in (int, float, long):
                s.append(Fraction(i))
            else:
                try:
                    s.append(Fraction(*json.loads(i)))
                except ValueError:
                    raise functions.OperatorError(
                        'farith', "invalid expression found: '" + i + "'")
    return simplify_fraction(s.pop())
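# `farith` relies on helpers defined elsewhere in the module: `Fraction`
# (stdlib `fractions`), `json`, and `simplify_fraction`. The latter is not
# shown in this excerpt; a minimal sketch of what it plausibly does,
# offered as an assumption:

from fractions import Fraction

def simplify_fraction_sketch(f):
    # Hypothetical stand-in: collapse whole-number Fractions to plain ints.
    return int(f) if f.denominator == 1 else f

def _demo_farith_arithmetic():
    # The second docstring example by hand:
    # - * / 15 - 7 + 1 1 3 + 2 + 1 1  ==  (15/(7-(1+1)))*3 - (2+(1+1)) == 5
    assert simplify_fraction_sketch(Fraction(15, 5) * 3 - 4) == 5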
def maincode(args, boolargs, nonstringargs, needsescape, notsplit, db, func,
             retalways, connectionhandler):
    autostring = 'automatic_vtable'
    try:
        largs, kargs = argsparse.parse(args, boolargs, nonstringargs,
                                       needsescape, notsplit)
    except Exception as e:
        raise functions.MadisError(e)
    if 'query' not in kargs:
        raise functions.OperatorError(
            func.__globals__['__name__'].rsplit('.')[-1],
            "needs query argument")
    query = kargs['query']
    del kargs['query']
    if autostring in kargs:
        del kargs[autostring]
    return doall(query, db, func, retalways, connectionhandler,
                 *largs, **kargs)
def step(self, *args):
    if self.notchecked:
        if len(args) < 2:
            raise functions.OperatorError("groupsum",
                                          "Wrong number of arguments")
        self.grouplen = args[0]
        self.numofargs = len(args)
        self.notchecked = False
    groupkey = args[1:self.grouplen + 1]
    try:
        group = self.groupsdict[groupkey]
        j = 0
        for i in xrange(self.grouplen + 1, self.numofargs):
            group[j].append(args[i])
            j += 1
    except KeyError:
        self.groupsdict[groupkey] = [[x] for x in args[self.grouplen + 1:]]
def apachelogsplit(*args):
    """
    .. function:: apachelogsplit(apache_log_line) -> [ip, ident, authuser, date, request, status, bytes, referrer, useragent]

    Breaks a single Apache log row into multiple fields.

    Examples:

    >>> table1('''
    ... '1.1.1.1 - - [01/Feb/2001:01:02:03 +0001] "HEAD /test.com HTTP/1.1" 200 - "-" "reftest"'
    ... ''')
    >>> sql("select apachelogsplit(a) from table1")
    ip      | ident | authuser | date                     | method | uri       | httpver | status | bytes | referrer | useragent
    ----------------------------------------------------------------------------------------------------------------------------
    1.1.1.1 | None  | None     | 2001-02-01T01:02:03+0001 | HEAD   | /test.com | 1.1     | 200    | None  | None     | reftest
    """
    yield ('ip', 'ident', 'authuser', 'date', 'method', 'uri', 'httpver',
           'status', 'bytes', 'referrer', 'useragent')
    f = apache_log_split.match(''.join(args).strip())
    if f is None:
        raise functions.OperatorError(
            "APACHELOGSPLIT", "Row function didn't receive any input")
    f = f.groups()
    f = [None if x == '-' else x for x in f]
    # Parse the date into ISO 8601 form.
    if f[3] is not None:
        if f[3][4:7] in months:
            f[3] = f[3][1:-1]
            date = f[3]
            f[3] = (date[7:11] + '-' + months[date[3:6]] + '-' + date[0:2] +
                    'T' + date[12:14] + ':' + date[15:17] + ':' +
                    date[18:20] + date[21:])
    if f[7] is not None:
        f[7] = int(f[7])
    if f[8] is not None:
        f[8] = int(f[8])
    yield f
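# `apachelogsplit` assumes two module-level helpers that are outside this
# excerpt: the compiled `apache_log_split` pattern and the `months` map.
# Plausible definitions for the Apache combined log format are sketched
# below as an assumption; the module's actual pattern may differ.

import re

months = {'Jan': '01', 'Feb': '02', 'Mar': '03', 'Apr': '04', 'May': '05',
          'Jun': '06', 'Jul': '07', 'Aug': '08', 'Sep': '09', 'Oct': '10',
          'Nov': '11', 'Dec': '12'}

apache_log_split = re.compile(
    r'(\S+) (\S+) (\S+) (\[[^\]]+\]) "(\S+) (\S+) HTTP/(\S+)" '
    r'(\S+) (\S+) "([^"]*)" "([^"]*)"')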
def reencode(*args):
    if len(args) != 1:
        raise functions.OperatorError("reencode",
                                      "operator takes only one argument")
    us = args[0]
    if us is None:
        return None
    us = unicode(us)
    try:
        return unicode(us.encode('iso-8859-1'), 'utf-8')
    except KeyboardInterrupt:
        raise
    except Exception:
        try:
            return unicode(us.encode('windows-1252'), 'utf-8')
        except Exception:
            return us
def var(*args):
    """
    .. function:: var(varname[, value]) -> value

    Sets (if both varname and value are given) or returns (if only varname
    is given) the contents of a variable.

    Examples:

    >>> sql("var 'v'")  # doctest: +NORMALIZE_WHITESPACE
    Traceback (most recent call last):
    ...
    OperatorError: Madis SQLError: Operator VAR: Variable 'v' does not exist

    >>> sql("var 'v' 5")
    var('v','5')
    ------------
    5

    >>> sql("var 'v'")
    var('v')
    --------
    5

    >>> sql("select var('v')")
    var('v')
    --------
    5
    """
    if len(args) == 0:
        return str(functions.variables.__dict__)
    var = args[0]
    if len(args) == 1:
        if hasattr(functions.variables, var):
            return functions.variables.__dict__[var]
        else:
            raise functions.OperatorError(
                'var', "Variable '" + var + "' does not exist")
    elif len(args) == 2:
        functions.variables.__dict__[var] = args[1]
        return functions.variables.__dict__[var]
    else:
        return None
def VTiter(self, *parsedArgs, **envars):
    def authorizer(operation, paramone, paramtwo, databasename,
                   triggerorview):
        """Called when each operation is prepared. We can return SQLITE_OK,
        SQLITE_DENY or SQLITE_IGNORE."""
        # Record the operation under its symbolic name.
        plan.append([
            apsw.mapping_authorizer_function[operation], paramone, paramtwo,
            databasename, triggerorview
        ])
        return apsw.SQLITE_OK

    def buststatementcache():
        # Prepare enough distinct statements to evict the cached plan of
        # the query under inspection, so it gets re-prepared (and therefore
        # re-authorized) below.
        c = connection.cursor()
        for i in xrange(110):
            a = list(c.execute("select " + str(i)))

    _, dictargs = self.full_parse(parsedArgs)
    yield [('operation', 'text'), ('paramone', 'text'), ('paramtwo', 'text'),
           ('databasename', 'text'), ('triggerorview', 'text')]
    if 'query' not in dictargs:
        raise functions.OperatorError(
            __name__.rsplit('.')[-1], "needs query argument")
    query = dictargs['query']
    connection = envars['db']
    plan = []
    buststatementcache()
    cursor = connection.cursor()
    cursor.setexectrace(lambda x, y, z: apsw.SQLITE_DENY)
    connection.setauthorizer(authorizer)
    cursor.execute(query)
    connection.setauthorizer(None)
    for r in plan:
        yield r
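# A minimal standalone sketch of the apsw authorizer mechanism used above
# (assumes apsw is installed; table and column names are illustrative):

import apsw

def _demo_authorizer_plan():
    def authorizer(op, p1, p2, dbname, trigview):
        print apsw.mapping_authorizer_function[op], p1, p2
        return apsw.SQLITE_OK
    con = apsw.Connection(':memory:')
    con.cursor().execute('create table t(a, b)')
    con.setauthorizer(authorizer)
    # Preparing the select triggers SQLITE_READ callbacks for t.a.
    list(con.cursor().execute('select a from t'))
    con.setauthorizer(None)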
def step(self, *args):
    if not args:
        raise functions.OperatorError("frecencyindex", "No arguments")
    now = datetime.datetime.now()
    now = iso8601.parse_date(now.strftime("%Y-%m-%d %H:%M:%S"))
    dt = iso8601.parse_date(args[0].replace('Z', ''))
    diff = now - dt
    if diff.days < 30:
        self.monthCounter += 1
    elif diff.days < 3 * 30:
        self.trimesterCounter += 1
    elif diff.days < 6 * 30:
        self.semesterCounter += 1
    elif diff.days < 12 * 30:
        self.yearCounter += 1
    elif diff.days < 24 * 30:
        self.twoyearsCounter += 1
def step(self, *args):
    if not args:
        raise functions.OperatorError("frecency", "No arguments")
    # The last two arguments are static, so they are parsed only on the
    # first invocation.
    if not self.initstatic:
        self.initstatic = True
        self.points = 100.0
        self.now = datetime.datetime.now()
        if len(args) >= 2:
            for arg in args[1:]:
                isnowarg = re_now.match(arg)
                if isnowarg:
                    nowdate = isnowarg.groupdict()['now']
                    self.now = iso8601.parse_date(nowdate)
                else:
                    self.points = int(arg)
    input = args[0]
    dt = iso8601.parse_date(input)
    self.frecency += self.__decrease(self.now - dt) * self.points
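# `step` references a module-level `re_now` pattern (extracts an explicit
# reference date) and a private `__decrease` method (weights a timedelta).
# Neither appears in this excerpt; the sketches below are assumptions only.

import re

re_now_sketch = re.compile(r'now:(?P<now>.+)')

def _decrease_sketch(delta):
    # Hypothetical step-wise recency weighting; the real scheme may differ.
    if delta.days < 30:
        return 1.0
    elif delta.days < 6 * 30:
        return 0.5
    return 0.25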
def jdict(*args):
    """
    .. function:: jdict(key, value, key1, value1) -> jdict

    Returns a jdict of the given key and value pairs.

    Examples:

    >>> sql(''' select jdict('key1', 'val1', 'key2', 'val2') ''')  # doctest: +NORMALIZE_WHITESPACE
    jdict('key1', 'val1', 'key2', 'val2')
    -------------------------------------
    {"key1":"val1","key2":"val2"}

    >>> sql(''' select jdict('key', '{"k1":1,"k2":2}') ''')  # doctest: +NORMALIZE_WHITESPACE
    jdict('key', '{"k1":1,"k2":2}')
    -------------------------------
    {"key":{"k1":1,"k2":2}}

    >>> sql(''' select jdict('key', '["val1", "val2"]') ''')  # doctest: +NORMALIZE_WHITESPACE
    jdict('key', '["val1", "val2"]')
    --------------------------------
    {"key":["val1","val2"]}

    >>> sql(''' select jdict('1') ''')  # doctest: +NORMALIZE_WHITESPACE
    Traceback (most recent call last):
    ...
    OperatorError: Madis SQLError: Operator JDICT: At least two arguments required
    """
    if len(args) == 1:
        raise functions.OperatorError('jdict',
                                      "At least two arguments required")
    result = OrderedDict()
    for i in xrange(0, len(args), 2):
        result[args[i]] = jopts.fromjsingle(args[i + 1])
    return jopts.toj(result)
def VTiter(self, *parsedArgs, **envars):
    largs, dictargs = self.full_parse(parsedArgs)
    if 'query' not in dictargs:
        raise functions.OperatorError(__name__.rsplit('.')[-1],
                                      "No query argument")
    query = dictargs['query']
    cur = envars['db'].cursor()
    c = cur.execute(query, parse=False)
    schema = [('_rowid_',)]
    record = []
    r1 = []
    firstrow = c.next()
    schema.append((firstrow[1],))
    record.append(firstrow[0])
    record.append(firstrow[2])
    for row in c:
        if row[0] == firstrow[0]:
            schema.append((row[1],))
            record.append(row[2])
        else:
            firstrow = row
            r1.append(row[0])
            r1.append(row[2])
            break
    yield schema
    yield record
    record = r1
    for row in c:
        if row[0] == firstrow[0]:
            record.append(row[2])
        else:
            firstrow = row
            r1 = []
            r1.append(row[0])
            r1.append(row[2])
            yield record
            record = r1
    yield record
def step(self, *args):
    if self.init:
        self.init = False
        if not args:
            raise functions.OperatorError("fsum", "No arguments")
    try:
        if type(args[0]) in (int, float, long):
            x = Fraction(args[0])
        else:
            try:
                json_object = json.loads(args[0])
                x = Fraction(json_object[0], json_object[1])
            except ValueError:
                return
    except KeyboardInterrupt:
        raise
    except:
        return
    self.x += x
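# Why fsum accumulates Fractions instead of floats: rational arithmetic is
# exact, so repeated additions do not drift. A standalone stdlib-only check:

from fractions import Fraction

def _demo_fraction_sum():
    floats = sum(0.1 for _ in xrange(10))             # 0.9999999999999999
    exact = sum(Fraction(1, 10) for _ in xrange(10))  # Fraction(1, 1)
    assert exact == 1 and floats != 1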
def VTiter(self, *parsedArgs, **envars):
    largs, dictargs = self.full_parse(parsedArgs)
    if 'query' not in dictargs:
        raise functions.OperatorError(
            __name__.rsplit('.')[-1], "No query argument")
    query = dictargs['query']
    cur = envars['db'].cursor()
    c = cur.execute(query)
    schema = cur.getdescriptionsafe()
    init = True
    for myrow in c:
        # Assume the input is ordered by node number.
        level = int(myrow[0])  # current level
        # Nodes of the level at hand; each one is a dict.
        nodestoinsert = ast.literal_eval(myrow[1])
        for i in nodestoinsert:
            if str(i['leafval']) == "":
                i.pop('leafval')
            if str(i['childnodes']) == "":
                i.pop('childnodes')
        if init is True:
            for k in nodestoinsert:
                k.pop('id')
            resulttable = nodestoinsert
            init = False
        else:
            recursive_checkchilds(resulttable, nodestoinsert, level)
    yield [('result', )]
    yield [str(resulttable).replace("'", "\"")]
def contains(*args):
    """
    .. function:: contains(str1, str2) -> bool

    Returns true if string *str1* contains *str2*.

    Examples:

    >>> sql("select contains('test string', 'str') as test ")
    test
    ----
    1

    >>> sql("select contains('test string', 'nostr') as test ")
    test
    ----
    0
    """
    if len(args) != 2:
        raise functions.OperatorError("contains",
                                      "operator takes exactly two arguments")
    return args[1] in args[0]
def pyfunerrtonul(*args):
    """
    .. function:: pyfunerrtonul(pyfunction, parameters)

    Calls a python function and returns the result. If an error occurs it
    returns *null*.

    >>> sql("select pyfunerrtonul('math.sqrt', -1)")
    pyfunerrtonul('math.sqrt', -1)
    ------------------------------
    None

    >>> sql("select pyfunerrtonul('math.log10', -1)")
    pyfunerrtonul('math.log10', -1)
    -------------------------------
    None
    """
    if len(args) == 0:
        return
    fsplit = args[0].split('.')
    try:
        f = __import__(fsplit[0])
        for i in fsplit[1:]:
            f = f.__dict__[i]
    except:
        try:
            f = __import__('libexternal' + '.' + fsplit[0])
            for i in fsplit:
                f = f.__dict__[i]
        except:
            raise functions.OperatorError(
                "pyfunerrtonul", "didn't find function: " + args[0])
    try:
        res = f(*args[1:])
    except Exception:
        return None
    # Return the computed result; the original fell off the end here and
    # always returned None.
    return res
def VTiter(self, *parsedArgs, **envars):
    largs, dictargs = self.full_parse(parsedArgs)
    if 'query' not in dictargs:
        raise functions.OperatorError(__name__.rsplit('.')[-1],
                                      "No query argument")
    query = dictargs['query']
    c = envars['db'].cursor()
    q = c.execute(query, parse=False)
    try:
        yield list(c.getdescriptionsafe())
    except StopIteration:
        try:
            raise
        finally:
            try:
                c.close()
            except:
                pass
    for _ in q:
        pass
def regexprfindall(*args):
    """
    .. function:: regexprfindall(pattern, text)

    Returns *all* matches of *pattern* in text.

    Examples:

    >>> sql("select regexprfindall('\w+', 'one')")
    regexprfindall('\w+', 'one')
    ----------------------------
    ["one"]

    >>> sql("select regexprfindall('\w+', 'one two three')")
    regexprfindall('\w+', 'one two three')
    --------------------------------------
    ["one","two","three"]
    """
    if len(args) != 2:
        raise functions.OperatorError('regexprfindall',
                                      'Two parameters should be provided')
    return jopts.tojstrict(re.findall(args[0], unicode(args[1]), re.UNICODE))
def sunitouni(*args):
    """
    .. function:: sunitouni(str)

    Returns *str* with literal unicode code points replaced by their
    unicode characters.

    Examples:

    >>> sql("select sunitouni('br\\u00fbl\\u00e9') as test ")
    test
    -------
    brûlé

    >>> sql("select sunitouni('\\u that is not a unicode code point') as test ")
    test
    -----------------------------------
    \u that is not a unicode code point

    >>> sql("select sunitouni(null)")
    sunitouni(null)
    ---------------
    None

    >>> sql("select sunitouni(9)")
    sunitouni(9)
    ------------
    9
    """
    if len(args) != 1:
        raise functions.OperatorError("sunitouni",
                                      "operator takes only one argument")
    if args[0] is None:
        return None
    kk = "u'%s'" % (str(args[0]).replace("'", "\\'"))
    try:
        return eval(kk)
    except KeyboardInterrupt:
        raise
    except Exception:
        return args[0]
def normuni(*args):
    """
    .. function:: normuni(str)

    Returns *str* normalised to the composed unicode normal form (NFC),
    without replacing look-alike characters.

    For example the character 'À' can be encoded as one or as two code
    points; :func:`normuni` returns the one-character encoded version. This
    function is important for checking true string equality. Functions
    :func:`sunitouni` and :func:`unitosuni` are used in the examples to
    make them more comprehensible.

    Examples:

    .. note:: The results of the next two examples should look the same. If
        they do not, that is a bug in the combining-characters rendering of
        the shell on which this documentation was produced.

    >>> sql("select sunitouni('C\u0327') as test ")
    test
    ----
    Ç

    >>> sql("select normuni(sunitouni('C\u0327')) as test ")
    test
    ----
    Ç

    >>> sql("select unitosuni(normuni(sunitouni('C\u0327'))) as test ")
    test
    ------
    \u00c7
    """
    if len(args) != 1:
        raise functions.OperatorError("normuni",
                                      "operator takes only one argument")
    if args[0] is None:
        return None
    return strdata.normalize('NFC', args[0])
def VTiter(self, *parsedArgs, **envars):
    largs, dictargs = self.full_parse(parsedArgs)
    self.nonames = True
    self.names = []
    self.types = []
    if 'query' not in dictargs:
        raise functions.OperatorError(
            __name__.rsplit('.')[-1], "No query argument")
    query = dictargs['query']
    cur = envars['db'].cursor()
    c = cur.execute(query)
    yield [('c1', )]
    output = StringIO.StringIO()
    writer = csv.writer(output)
    for r in c:
        writer.writerow(r)
    for row in output.getvalue().splitlines():
        yield (row, )
def requirevars(*args):
    """
    .. function:: requirevars(varname1, [varname2,...])

    Checks that all of the given variables exist. If not, it throws an
    exception.

    Examples:

    >>> sql("var 'cv1' 5")
    var('cv1','5')
    --------------
    5

    >>> sql("var 'cv2' 10")
    var('cv2','10')
    ---------------
    10

    >>> sql("requirevars 'cv1' 'cv2'")
    requirevars('cv1','cv2')
    ------------------------
    1

    >>> sql("requirevars cv1 cv2")
    requirevars('cv1 cv2')
    ----------------------
    1

    >>> sql("requirevars 'cv1' 'testvar'")  # doctest: +NORMALIZE_WHITESPACE
    Traceback (most recent call last):
    ...
    OperatorError: Madis SQLError: Operator REQUIREVARS: Variable testvar isn't initialized
    """
    for v in (' '.join(args).strip()).split():
        if not hasattr(functions.variables, v):
            raise functions.OperatorError(
                "requirevars", "Variable %s isn't initialized" % v)
    return 1
def regexprmatches(*args):
    """
    .. function:: regexprmatches(pattern, arg)

    Returns true if the pattern matches arg, or false otherwise. The
    example below uses madis's inverted operator syntax.

    Examples:

    >>> sql("regexprmatches '(a)' 'qwer a qwer' ")
    regexprmatches('(a)','qwer a qwer')
    -----------------------------------
    1
    """
    if len(args) != 2:
        raise functions.OperatorError('regexprmatches',
                                      'Two parameters should be provided')
    return re.search(args[0], unicode(args[1]), re.UNICODE) is not None
def flowname(*args):
    """
    .. function:: flowname([str])

    Sets and retrieves the 'flowname' variable.

    Examples:

    >>> sql("flowname test flow ")
    flowname('test flow')
    ---------------------
    test flow

    >>> sql("flowname")
    flowname()
    ----------
    test flow

    >>> sql("flowname 'arg1' arg2")  # doctest: +NORMALIZE_WHITESPACE
    Traceback (most recent call last):
    ...
    OperatorError: Madis SQLError: Operator FLOWNAME: Flowname accepts only 1 argument
    """
    var = 'flowname'
    if len(args) > 1:
        raise functions.OperatorError('flowname',
                                      'Flowname accepts only 1 argument')
    if len(args) == 0 and hasattr(functions.variables, var):
        return str(functions.variables.__dict__[var])
    elif len(args) == 1:
        functions.variables.__dict__[var] = ' '.join(
            [str(x) for x in args[0:]])
        return str(functions.variables.__dict__[var])
    else:
        return None
def jintersection(*args):
    """
    .. function:: jintersection(jpackA, jpackB) -> jpack

    Returns the items that appear in both jpackA and jpackB (set
    intersection).

    Examples:

    >>> sql("select jintersection('[1,2,3]', '[1,2,3]')")  # doctest: +NORMALIZE_WHITESPACE
    jintersection('[1,2,3]', '[1,2,3]')
    -----------------------------------
    [1,2,3]

    >>> sql("select jintersection('[1,2,3]', '[1,3]', 1)")  # doctest: +NORMALIZE_WHITESPACE
    jintersection('[1,2,3]', '[1,3]', 1)
    ------------------------------------
    1
    """
    if len(args) < 2:
        raise functions.OperatorError(
            "jintersection", "operator needs at least two inputs")
    return jopts.toj(
        sorted(set.intersection(*[set(jopts.fromj(x)) for x in args])))
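# The jopts helpers are defined elsewhere; assuming jopts.fromj parses a
# jpack into a Python list and jopts.toj serializes one back, the core of
# jintersection is plain set intersection (json used here as a stand-in):

import json

def _demo_jpack_intersection():
    packs = ['[1,2,3]', '[1,3]', '[1]']
    common = sorted(set.intersection(*[set(json.loads(p)) for p in packs]))
    assert common == [1]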
def VTiter(self, *parsedArgs, **envars):
    largs, dictargs = self.full_parse(parsedArgs)
    if 'query' not in dictargs:
        raise functions.OperatorError(
            __name__.rsplit('.')[-1], "No query argument")
    query = dictargs['query']
    connection = envars['db']
    yield (('column', 'text'), ('type', 'text'))
    cur = connection.cursor()
    execit = cur.execute(query, parse=False)
    try:
        samplerow = execit.next()
    except StopIteration:
        pass
    vals = cur.getdescriptionsafe()
    cur.close()
    for i in vals:
        yield i
def VTiter(self, *parsedArgs, **envars):
    largs, dictargs = self.full_parse(parsedArgs)
    self.nonames = True
    self.names = []
    self.types = []
    if 'query' not in dictargs:
        raise functions.OperatorError(__name__.rsplit('.')[-1],
                                      "No query argument")
    query = dictargs['query']
    cur = envars['db'].cursor()
    c = cur.execute(query)
    schema = cur.getdescriptionsafe()
    schema1 = []
    first_row = c.next()
    first_tuple = []
    j = 0
    for i in first_row:
        if is_number(i):
            schema1.append(schema[j])
            first_tuple.append(i)
        j += 1
    yield tuple(schema1)
    yield tuple(first_tuple)
    for row in c:
        tmp_row = []
        j = 0
        for col in row:
            if schema[j] in schema1:
                tmp_row.append(col)
            j += 1
        yield tmp_row
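# `is_number` is referenced above but defined outside this excerpt. A
# plausible stand-in, offered as an assumption:

def is_number(x):
    # Treat anything float() accepts (ints, floats, numeric strings) as
    # numeric.
    try:
        float(x)
        return True
    except (TypeError, ValueError):
        return False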
def VTiter(self, *parsedArgs, **envars):
    largs, dictargs = self.full_parse(parsedArgs)
    if 'key' not in dictargs:
        raise functions.OperatorError(
            __name__.rsplit('.')[-1], "No URL argument")
    else:
        key = dictargs['key']
    if 'rate' in dictargs:
        rate = dictargs['rate']
    else:
        rate = 0
    schema = [('id', 'text'), ('tweet', 'text'), ('name', 'text'),
              ('location', 'text'), ('favourites', 'text'),
              ('screen_name', 'text'), ('friends', 'text'),
              ('followers', 'text'), ('sentiment', 'text')]
    yield schema
    db = 'db5.db'
    tname = key
    rows = checkTableMetadata(key, rate, tname, db)
    if rows is not None:
        # Serve the table from the cache.
        for r in rows:
            yield r
        return
    api = twitter.Api(
        consumer_key='5vQVQ4B8bUcNGG3WOKr80gPdQ',
        consumer_secret='jkQw1PPQrKcKddBjg6AqYNH3n7cAogXhNTwf4m13urR37zKUdG',
        access_token_key='747542150561341440-RyK8r6AA0iCr3w5cbuNKmcxCDRfdJ42',
        access_token_secret='v5PfDnaLCIRu8KyLmfzXDOrykUtK96mmIwkTQNoUHG7mW')
    results = api.GetSearch(
        raw_query="l=&q=" + key + "%20-filter%3Aretweets&count=100")
    tuples = []
    sentiment = ''
    host = socket.gethostname()
    port = 12345
    for r in results:
        tweet = unicode(r.text)
        # The sentiment socket client was disabled in the original:
        # s = socket.socket(socket.AF_INET, socket.SOCK_STREAM)
        # s.connect((host, port))
        # s.sendall(r.text.encode('UTF-8'))
        # sentiment = s.recv(1024)
        t = (r.id, tweet, r.user.name, r.user.location,
             r.user.favourites_count, r.user.screen_name,
             r.user.friends_count, r.user.followers_count, sentiment)
        yield t
        tuples.append(t)
    # s.close()  # disabled with the socket code above; calling it here
    #            # would raise NameError since `s` is never created
    createTable(db, tname, schema, tuples)  # materialize the table
    # getMaterializedContent(db, tname)
    gc.enable()
def VTiter(self, *parsedArgs, **envars):
    largs, dictargs = self.full_parse(parsedArgs)
    if len(largs) < 1:
        raise functions.OperatorError(
            __name__.rsplit('.')[-1], "Not defined union tables")
    streams = str(largs[0]).split(",")
    if len(streams) < 2:
        raise functions.OperatorError(
            __name__.rsplit('.')[-1], "Union tables must be more than one")
    cursors = []
    execs = []
    for stream in streams:
        cursors.append(envars['db'].cursor())
        execs.append(
            cursors[-1].execute("select * from " + str(stream) + ";"))
    comparedcursor = str(cursors[0].getdescriptionsafe())
    # for cursor in cursors:
    #     if str(cursor.getdescriptionsafe()) != comparedcursor:
    #         raise functions.OperatorError(__name__.rsplit('.')[-1],
    #                                       "Union tables with different schemas ")
    if 'cols' in dictargs:
        try:
            cols = int(dictargs['cols'])
        except ValueError:
            try:
                cols = [y[0] for y in cursors[0].getdescriptionsafe()
                        ].index(dictargs['cols'])
            except ValueError:
                raise functions.OperatorError(
                    __name__.rsplit('.')[-1], "Column name does not exist")
    else:
        cols = 0
    if cols >= len(cursors[0].getdescriptionsafe()):
        raise functions.OperatorError(
            __name__.rsplit('.')[-1], "Column position does not exist")

    def tag(ex, idx):
        # Key each row by the merge column and remember which stream it
        # came from. Binding idx through an argument (instead of unrolling
        # one generator per stream, as the original did for up to five
        # streams) works for any number of streams.
        return ((v[cols], (idx, ) + v) for v in ex)

    for x in range(len(streams)):
        execs[x] = tag(execs[x], x)
    try:
        yield list(cursors[0].getdescriptionsafe())
    except StopIteration:
        try:
            raise
        finally:
            try:
                for cur in cursors:
                    cur.close()
            except:
                pass
    currentgroup = None
    lists = [[]] * len(streams)
    for k, v in heapq.merge(*execs):
        if currentgroup is None or currentgroup != k:
            # Emit the rows of the first stream that no other stream
            # contains within the finished key group.
            unionset = set().union(*lists[1:])
            for t in (set(lists[0]) - unionset):
                yield t
            lists = [[]] * len(streams)
        lists[v[0]] = lists[v[0]] + [tuple(v[1:])]
        currentgroup = k
    unionset = set().union(*lists[1:])
    for t in list(set(lists[0]) - unionset):
        yield t
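# The stream EXCEPT above rides on heapq.merge: each input is a generator
# of (key, tagged_row) pairs already sorted by key, so merging yields rows
# grouped by key without materializing any table. A standalone sketch:

import heapq

def _demo_heapq_merge():
    a = iter([(1, (0, 'a1')), (3, (0, 'a3'))])  # stream 0, sorted by key
    b = iter([(1, (1, 'b1')), (2, (1, 'b2'))])  # stream 1, sorted by key
    merged = list(heapq.merge(a, b))
    assert merged == [(1, (0, 'a1')), (1, (1, 'b1')),
                      (2, (1, 'b2')), (3, (0, 'a3'))]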
def VTiter(self, *parsedArgs, **envars):
    import sklearn
    largs, dictargs = self.full_parse(parsedArgs)
    if 'query' not in dictargs:
        raise functions.OperatorError(
            __name__.rsplit('.')[-1], "No query argument")
    query = dictargs['query']
    print 'MADIS/QUERY', query
    cur = envars['db'].cursor()
    c = cur.execute(query, parse=False)
    schema = []
    try:
        schema = [x[0] for x in cur.getdescriptionsafe()]
    except StopIteration:
        try:
            raise
        finally:
            try:
                c.close()
            except:
                pass
    if 'filename' not in dictargs:
        raise functions.OperatorError(
            __name__.rsplit('.')[-1], "No filename provided")
    f = open(dictargs['filename'], 'w')
    if 'initstr' not in dictargs:
        raise functions.OperatorError(
            __name__.rsplit('.')[-1], "No initialization string")
    initstr = dictargs['initstr']

    # --- imports ---
    import itertools
    # The original left the sklearn wildcard imports commented out, so
    # eval(initstr) expects the estimator class to be resolvable by initstr
    # itself (or uncomment the relevant line):
    # from sklearn.cluster import *
    # from sklearn.linear_model import *
    # from sklearn.neighbors import *
    # from sklearn.svm import *
    # from sklearn.naive_bayes import *
    # from sklearn.tree import *
    # from sklearn.ensemble import *
    from sklearn.model_selection import cross_val_predict
    print 'MADIS/sklearn version', sklearn.__version__
    import cPickle as cp
    import numpy as np
    import zlib
    # ---------------

    model = eval(initstr)
    print 'MADIS/MODEL:', model
    if 'classname' not in dictargs:
        # No class column given: unsupervised fit (e.g. clustering). The
        # original consumed the first row before building the list; all
        # rows are kept here.
        trainList = [row for row in c]
        train = np.array(trainList).astype(np.float)
        model.fit(train)
        pstr = cp.dumps(model, 2)
        f.write(zlib.compress(pstr, 3))
        yield [('id', ), ('cluster_label', )]
        for i in xrange(0, len(train)):
            yield (i, int(model.labels_[i]))
    else:
        classname = dictargs['classname']
        idclassname = schema.index(classname)
        trainList = []
        targetList = []
        if 'cv' not in dictargs:
            cv = 5
        else:
            cv = int(dictargs['cv'])
        # Construct groups of samples:
        if 'groupname' in dictargs:
            groupname = dictargs['groupname']
            idgroupname = schema.index(groupname)
            groupList = []
        for i, row in enumerate(c):
            trainList.append(
                list(row[0:idclassname] + row[idclassname + 1:len(row)]))
            targetList.append(int(row[idclassname]))
            if 'groupname' in dictargs:
                groupList.append(row[idgroupname])
                del trainList[i][idgroupname]
        if 'groupname' in dictargs:
            groups = np.array(groupList)
        else:
            groups = None
        X = np.array(trainList).astype(np.float)
        y = np.array(targetList).astype(np.int)
        preds = cross_val_predict(model, X, y, cv=cv, groups=groups)
        pred_probs = []
        if hasattr(model, 'probability') and model.probability:
            pred_probs = cross_val_predict(model, X, y, cv=cv,
                                           groups=groups,
                                           method='predict_proba')
        # Fit again on all data and store the model on disk:
        model.fit(X, y)
        pstr = cp.dumps(model, 2)
        f.write(zlib.compress(pstr, 3))
        if hasattr(model, 'probability') and model.probability:
            yield [('id', ), ('predicted_label', ),
                   ('prediction_probability', ), ('probs_per_class', )]
            for i in range(len(X)):
                pred = preds[i]
                yield (i, int(pred), pred_probs[i][pred],
                       str([pred_probs[i][j]
                            for j in range(len(model.classes_))]))
        else:
            yield [('id', ), ('predicted_label', )]
            for i in range(len(X)):
                pred = preds[i]
                yield (i, int(pred))
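# The shape contract assumed above: cross_val_predict with
# method='predict_proba' returns an (n_samples, n_classes) array, so
# pred_probs[i][pred] is the probability assigned to the predicted class.
# A minimal standalone check (assumes scikit-learn >= 0.18):

def _demo_cross_val_predict_proba():
    from sklearn.linear_model import LogisticRegression
    from sklearn.model_selection import cross_val_predict
    import numpy as np
    X = np.array([[0.0], [0.1], [0.9], [1.0], [0.2], [0.8]])
    y = np.array([0, 0, 1, 1, 0, 1])
    probs = cross_val_predict(LogisticRegression(), X, y, cv=2,
                              method='predict_proba')
    assert probs.shape == (6, 2)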
def VTiter(self, *args, **formatArgs):
    largs, dictargs = self.full_parse(args)
    where = None
    mode = 'row'
    if 'file' in dictargs:
        where = dictargs['file']
    else:
        raise functions.OperatorError(__name__.rsplit('.')[-1],
                                      "No destination provided")
    if 'mode' in dictargs:
        mode = dictargs['mode']
    col = 0
    if 'cols' in dictargs:
        a = re.split(' |,| , |, | ,', dictargs['cols'])
        column = [x for x in a if x != '']
    else:
        col = 1
    filename, ext = os.path.splitext(os.path.basename(where))
    fullpath = os.path.split(where)[0]
    fileIter = open(where, "rb")

    # NOTE: the selection/filter parameters below are hardcoded in the
    # original source (select columns 10 and 11, filter on column 10).
    selectcols = (10, 11)
    filtercols = [10]
    value = "1993-01-26"
    indices = []

    if mode == 'spac':
        import msgpack
        blocksize = struct.unpack('!i', fileIter.read(4))
        b = struct.unpack('!B', fileIter.read(1))  # header byte
        schema = cPickle.load(fileIter)
        colnum = len(schema)
        found = 0
        index_found = 0
        blocknum = 0
        myvals = [[None] for _ in xrange(colnum)]
        myfiltervals = [[None] for _ in xrange(colnum)]
        selectschema = [str(x) for x in selectcols]
        yield selectschema
        input = cStringIO.StringIO()

        def binarySearch(alist, item):
            first = 0
            last = len(alist) - 1
            found = False
            midpoint = (first + last) // 2
            while first <= last and not found:
                midpoint = (first + last) // 2
                if alist[midpoint] == item:
                    found = True
                elif item < alist[midpoint]:
                    last = midpoint - 1
                else:
                    first = midpoint + 1
            return midpoint, found

        while True:
            input.truncate(0)
            blocknum += 1
            try:
                blocksize = struct.unpack('!i', fileIter.read(4))
            except:
                break
            input.write(fileIter.read(blocksize[0]))
            input.seek(0)
            block_kind = struct.unpack('!B', input.read(1))
            compression_bit = struct.unpack('!B', input.read(1))
            fmt = '!' + 'i' * (colnum * 4 + 1)
            ind = list(struct.unpack(fmt, input.read(4 * (colnum * 4 + 1))))
            d2 = [[] for _ in xrange(len(selectcols))]

            # --- filter evaluation ---
            for index, col in enumerate(filtercols):
                indices = []
                input.seek(ind[col * 4 + 3])
                if ind[col * 4 + 2] == 0:
                    # Full data block: replaces the running dictionary.
                    column = msgpack.loads(
                        zlib.decompress(input.read(ind[col * 4])))
                    myfiltervals[col] = column
                    if len(myfiltervals[col]) < 256:
                        listptr = array('B')
                    elif len(myfiltervals[col]) < 65536:
                        listptr = array('H')
                    else:
                        listptr = array('i')
                    listptr.fromstring(
                        zlib.decompress(input.read(ind[col * 4 + 1])))
                    if (value >= myfiltervals[col][0] and
                            value <= myfiltervals[col][-1]):
                        t = binarySearch(myfiltervals[col], value)
                        if t[1]:
                            found = 1
                            index_found = t[0]
                            for j, i in enumerate(listptr):
                                if i == index_found:
                                    indices.append(j)
                        else:
                            found = 0
                    else:
                        found = 0
                else:
                    # Differential block: extends the running dictionary.
                    column = msgpack.loads(
                        zlib.decompress(input.read(ind[col * 4])))
                    if found:
                        myfiltervals[col] = (myfiltervals[col] +
                                             [None] * len(column))
                        if len(myfiltervals[col]) < 256:
                            listptr = array('B')
                        elif len(myfiltervals[col]) < 65536:
                            listptr = array('H')
                        else:
                            listptr = array('i')
                        listptr.fromstring(
                            zlib.decompress(input.read(ind[col * 4 + 1])))
                        for j, i in enumerate(listptr):
                            if i == index_found:
                                indices.append(j)
                    else:
                        if value >= column[0] and value <= column[-1]:
                            t = binarySearch(column, value)
                            if t[1]:
                                found = 1
                                index_found = t[0] + len(myfiltervals[col])
                                total = len(myfiltervals[col]) + len(column)
                                if total < 256:
                                    listptr = array('B')
                                elif total < 65536:
                                    listptr = array('H')
                                else:
                                    listptr = array('i')
                                listptr.fromstring(
                                    zlib.decompress(
                                        input.read(ind[col * 4 + 1])))
                                for j, i in enumerate(listptr):
                                    if i == index_found:
                                        indices.append(j)
                        myfiltervals[col] = (myfiltervals[col] +
                                             [None] * len(column))
            # --- end of filter evaluation ---

            for index, col in enumerate(selectcols):
                input.seek(ind[col * 4 + 3])
                column = msgpack.loads(
                    zlib.decompress(input.read(ind[col * 4])))
                if ind[col * 4 + 2] == 0:
                    myvals[col] = column
                else:
                    myvals[col] = myvals[col] + column
                if len(myvals[col]) < 256:
                    listptr = array('B')
                elif len(myvals[col]) < 65536:
                    listptr = array('H')
                else:
                    listptr = array('i')
                if ind[col * 4 + 1] == 0 and ind[col * 4 + 2] == 0:
                    # Constant column: repeat the single value.
                    for i in xrange(ind[len(ind) - 1]):
                        d2[index].append(myvals[col][0])
                else:
                    listptr.fromstring(
                        zlib.decompress(input.read(ind[col * 4 + 1])))
                    if len(filtercols) > 0:
                        for i in indices:
                            d2[index].append(myvals[col][listptr[i]])
                    else:
                        for i in listptr:
                            d2[index].append(myvals[col][i])
            for row in izip(*d2):
                yield row

    if mode == 'sorteddictpercol':
        import msgpack
        schema = msgpack.load(fileIter)
        colnum = len(schema)
        yield schema
        output = cStringIO.StringIO()
        blocknum = 0
        paxcols = {}
        while True:
            try:
                output.truncate(0)
                blocksize = struct.unpack('i', fileIter.read(4))
                ind = list(struct.unpack(
                    'L' * (colnum * 2 + 1),
                    fileIter.read(8 * (colnum * 2 + 1))))
                d2 = [[] for _ in xrange(colnum)]
                for c in xrange(colnum):
                    s = cPickle.loads(
                        zlib.decompress(fileIter.read(ind[c * 2])))
                    # Columns whose dictionary covers more than half of the
                    # block's rows are stored plain (PAX style), without an
                    # index array.
                    if ((blocknum == 1 and c in paxcols) or
                            (blocknum == 0 and
                             len(s) > 50 * 1.0 * ind[colnum * 2] / 100)):
                        d2[c] = s
                        if blocknum == 0:
                            paxcols[c] = 1
                    elif len(s) == 1:
                        d2[c] = [s[0] for _ in xrange(ind[colnum * 2])]
                    else:
                        listptr = array('B') if len(s) < 256 else array('H')
                        listptr.fromstring(
                            zlib.decompress(fileIter.read(ind[c * 2 + 1])))
                        for lala in listptr:
                            d2[c].append(s[lala])
                for row in izip(*d2):
                    yield row
                blocknum = 1
            except:
                break

    if mode == 'dictperval':
        if col:
            gc.disable()
            schema = cPickle.load(fileIter)
            colnum = len(schema)
            cols = [[] for _ in xrange(colnum)]
            yield schema
            listptr = [array('H') for _ in xrange(colnum)]
            while True:
                try:
                    ind = struct.unpack('L' * (colnum + 3),
                                        fileIter.read(8 * (colnum + 3)))
                    for i in xrange(colnum):
                        cols[i] = cPickle.load(fileIter)
                    for i in xrange(colnum):
                        listptr[i].fromfile(fileIter, ind[colnum + 2])
                    for row in xrange(ind[colnum + 2]):
                        tup = [0 for _ in xrange(colnum)]
                        for col in xrange(colnum):
                            tup[col] = cols[col][listptr[col][row]]
                        yield tup
                    listptr = [array('H') for _ in xrange(colnum)]
                except:
                    break
            gc.enable()
        elif len(column) == 1:
            schema = cPickle.load(fileIter)
            colid = [x[0] for x in schema].index(column[0])
            colnum = len(schema)
            yield [schema[colid]]
            while True:
                try:
                    ind = struct.unpack('L' * (colnum + 3),
                                        fileIter.read(8 * (colnum + 3)))
                    next = ind[colnum + 1]
                    fileIter.seek(ind[colid])
                    col = cPickle.load(fileIter)
                    fileIter.seek(ind[colnum])
                    listptr = [array('H') for _ in xrange(colnum)]
                    for i in xrange(colnum):
                        listptr[i].fromfile(fileIter, ind[colnum + 2])
                    for c in listptr[colid]:
                        yield (col[c], )
                    fileIter.seek(next)
                except:
                    break
        else:
            schema = marshal.load(fileIter)
            lcols = []
            for c in column:
                lcols.append([x[0] for x in schema].index(c))
            colnum = len(schema)
            yield [schema[lcols[i]] for i in xrange(len(lcols))]
            while True:
                row = 0
                try:
                    d = 0
                    ind = list(struct.unpack(
                        "<%dL" % ((colnum + 1) * 2),
                        fileIter.read(8 * (colnum + 1))))
                    next = ind[len(ind) - 2]
                    d2 = [[] for _ in xrange(len(lcols))]
                    j = 0
                    for c in lcols:
                        fileIter.seek(ind[c * 2])
                        d2[j] = marshal.load(fileIter)
                        j += 1
                    while True:
                        tup = []
                        for col in xrange(len(lcols)):
                            try:
                                tup.append(d2[col][row])
                            except:
                                d = 1
                                break
                        if d == 1:
                            break
                        yield tup
                        row += 1
                    fileIter.seek(next)
                except:
                    break

    if mode == 'rcstreampax':
        if col:
            schema = marshal.load(fileIter)
            colnum = len(schema)
            ENDFILE = 0
            yield schema
            while True:
                row = 0
                d = 0
                ind = [0 for _ in xrange(colnum + 2)]
                if ENDFILE == 1:
                    try:
                        marshal.load(fileIter)
                        ENDFILE = 0
                    except EOFError:
                        break
                for i in xrange(colnum + 2):
                    ind[i] = struct.unpack('L', fileIter.read(8))
                if ind[colnum + 1][0] == 1:
                    ENDFILE = 1
                d2 = [[] for _ in xrange(colnum)]
                for col in xrange(colnum):
                    obj = fileIter.read(ind[col + 1][0] - ind[col][0])
                    d2[col] = marshal.loads(zlib.decompress(obj))
                while True:
                    tup = []
                    for col in xrange(colnum):
                        try:
                            tup.append(d2[col][row])
                        except:
                            d = 1
                            break
                    if d == 1:
                        break
                    yield tup
                    row += 1
        elif len(column) == 1:
            schema = cPickle.load(fileIter)
            colid = [x[0] for x in schema].index(column[0])
            colnum = len(schema)
            yield [schema[colid]]
            while True:
                try:
                    ind = list(struct.unpack(
                        "<%dL" % ((colnum + 1) * 2),
                        fileIter.read(8 * (colnum + 1))))
                    next = ind[len(ind) - 2]
                    fileIter.seek(ind[colid * 2])
                    d2 = cPickle.loads(zlib.decompress(
                        fileIter.read(ind[colid * 2 + 1] - ind[colid * 2])))
                    for c in d2:
                        yield (c, )
                    fileIter.seek(next)
                except:
                    break
        else:
            schema = marshal.load(fileIter)
            lcols = []
            for c in column:
                lcols.append([x[0] for x in schema].index(c))
            colnum = len(schema)
            yield [schema[lcols[i]] for i in xrange(len(lcols))]
            while True:
                row = 0
                try:
                    d = 0
                    ind = list(struct.unpack(
                        "<%dL" % ((colnum + 1) * 2),
                        fileIter.read(8 * (colnum + 1))))
                    next = ind[len(ind) - 2]
                    d2 = [[] for _ in xrange(len(lcols))]
                    j = 0
                    for c in lcols:
                        fileIter.seek(ind[c * 2])
                        d2[j] = marshal.load(fileIter)
                        j += 1
                    while True:
                        tup = []
                        for col in xrange(len(lcols)):
                            try:
                                tup.append(d2[col][row])
                            except:
                                d = 1
                                break
                        if d == 1:
                            break
                        yield tup
                        row += 1
                    fileIter.seek(next)
                except:
                    break

    if mode == 'row':
        try:
            d2 = cPickle.Unpickler(fileIter).load()
            yield d2
            while True:
                try:
                    s = struct.unpack("i", fileIter.read(4))
                    for row in cPickle.loads(
                            zlib.decompress(fileIter.read(s[0]))):
                        yield row
                except:
                    break
        except EOFError:
            pass
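# Common to every mode above is dictionary encoding: a block stores a
# deduplicated value list per column plus a compact array of row indices
# into it, typed 'B'/'H'/'i' by dictionary size. A stdlib-only sketch of
# the decode step:

import zlib
from array import array

def _demo_dictionary_decode():
    values = ['red', 'green', 'blue']        # deduplicated column values
    listptr = array('B', [0, 1, 1, 2, 0])    # one dictionary index per row
    payload = zlib.compress(listptr.tostring())  # as stored on disk
    decoded = array('B')
    decoded.fromstring(zlib.decompress(payload))
    assert [values[i] for i in decoded] == \
        ['red', 'green', 'blue', 'blue', 'red']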