def _granulateDate(self, str_date):
    """Returns only the date parts that were successfully parsed."""
    di_1 = parseDate(str_date, ignoretz=True, fuzzy=True,
                     default=datetime(1900, 12, 28, 0, 0))  # 1900-12-28 default year, month and day. (day 28 exists for every month :-)
    di_2 = parseDate(str_date, ignoretz=True, fuzzy=True,
                     default=datetime(2000, 1, 1, 0, 0))  # 2000-01-01 default year, month and day.
    ## Parsed date with no defaults used:
    if str(di_1.date()) == str(di_2.date()):
        return str(di_1.date())  # Dates are the same: date is parsed completely.
    ## Check for default day and month:
    if di_1.date().day != di_2.date().day and di_1.date().month != di_2.date().month:
        return str(di_1.date().year)  # Only the year has been parsed successfully.
    if di_1.date().day != di_2.date().day and di_1.date().month == di_2.date().month:
        return '%s-%s' % (di_1.date().year, di_1.date().month)  # Only year and month have been parsed successfully.
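# Hedged usage sketch (not part of the original source). It assumes parseDate is
# dateutil's parser, e.g. `from dateutil.parser import parse as parseDate`, and that
# `obj` is an instance of the surrounding class; expected granular outputs:
#
#   obj._granulateDate('2007')        -> '2007'        (only the year was recognised)
#   obj._granulateDate('March 2007')  -> '2007-3'      (year and month recognised)
#   obj._granulateDate('2007-03-12')  -> '2007-03-12'  (fully parsed date)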
def isDate(self, string):
    """ Returns True if string is a valid date format """
    try:
        parseDate(string)
        return True
    except ValueError:
        return False
def _validateISO8601(self, datestring):
    ## See: http://labix.org/python-dateutil
    if datestring is None:
        return False
    try:
        parseDate(datestring, ignoretz=True, fuzzy=True)
    except ValueError:
        return False
    return True
def __load(self):
    if path.exists(self.__filePath):
        with open(self.__filePath, mode="r", newline="") as file:
            reader = csv.reader(file, delimiter=self.__delimiter)
            for item in reader:
                recordDate = parseDate(item[0])
                event = item[1]
                when = parseDate(item[2])
                version = item[3]
                parsed = (recordDate, event, when, version)
                self.__items.append(parsed)
def getEpisodes(self):
    # @todo Store last_sync_date
    days = 15
    now = datetime.now().replace(tzinfo=pytz.UTC)
    fromdate = (now - timedelta(days=days - 1)).strftime('%Y-%m-%d')
    response = self.request("calendars/my/shows/%s/%s" % (fromdate, days))
    result = []
    for episode in response:
        # Make sure the episode has aired
        if parseDate(episode["first_aired"]) < now:
            result.append({
                "type": "show",
                "id": episode["show"]["ids"]["tvdb"],
                "episode_id": episode["episode"]["ids"]["tvdb"],
                "title": episode["show"]["title"],
                "episode_title": episode["episode"]["title"],
                "year": episode["show"]["year"],
                "season": episode["episode"]["season"],
                "episode": episode["episode"]["number"],
            })
    return result
def get_element(element):
    text = "".join(n.data for n in element.childNodes if n.nodeType == n.TEXT_NODE)
    try:
        entry_type = element.getAttribute("type")
        if entry_type == "integer":
            try:
                return int(text)
            except ValueError:
                return 0
        elif entry_type in ("date", "datetime"):
            return parseDate(text)
        elif entry_type == "boolean":
            try:
                return text.strip().lower() in ("true", "1")
            except ValueError:
                return False
        elif entry_type == "decimal":
            try:
                return float(text)
            except ValueError:
                return 0.0
        else:
            return text
    except:
        return text
def getDate_text(front, lastPage):
    # Flatten front; when split by WC splitter, it comes out as a list of strings
    front = flatten(front)
    if len(front) == 0:
        return ''
    dateString = ''
    date = ''
    dateList = []
    # First, see if there is a "Date Decided" date, which is authoritative
    if dateDecidedRe0.search(front.lower()) != None:
        dateString = dateDecidedRe0.search(front.lower()).group()
    elif dateDecidedRe1.search(front.lower()) != None:
        dateString = dateDecidedRe1.search(front.lower()).group()
    # Second, see if there is a 'FILED' date (note this one is case sensitive, so no .lower())
    if len(dateString) == 0:
        if dateFiledRe0.search(front) != None:
            dateString = dateFiledRe0.search(front).group()
    # Third, see if there is a 'Dated:' date in the frontmatter
    if len(dateString) == 0:
        if dateDatedRe0.search(front.lower()) != None:
            dateString = dateDatedRe0.search(front.lower()).group()
        elif dateDatedRe1.search(front.lower()) != None:
            dateString = dateDatedRe1.search(front.lower()).group()
    # Fourth, see if there is a 'Final Report:' date in the frontmatter
    if len(dateString) == 0:
        if finalReportDateRe0.search(front.lower()) != None:
            dateString = finalReportDateRe0.search(front.lower()).group()
    # Fifth, see if there is a 'Dated:' (or 'Date:') date in the last page
    back = flatten(lastPage)
    if len(dateString) == 0:
        if dateDatedRe0.search(back.lower()) != None:
            dateString = dateDatedRe0.search(back.lower()).group()
        elif dateDatedRe1.search(back.lower()) != None:
            dateString = dateDatedRe1.search(back.lower()).group()
        elif dateDatedRe2.search(back.lower()) != None:
            dateString = dateDatedRe2.search(back.lower()).group()
    # Sixth, if no authoritative phrase, make a list of all dates in frontmatter
    if len(dateString) == 0:
        if standardDateRe0.search(front.lower()) != None:
            dateList.append(standardDateRe0.search(front.lower()).group())
        if standardDateRe1.search(front.lower()) != None:
            dateList.append(standardDateRe1.search(front.lower()).group())
        # ARBITRARILY CHOOSES FIRST OF ALL DATES IN THE DATELIST
        if len(dateList) > 0:
            dateString = dateList[0]
    # LAST, if possible, convert final results of date search
    # into MySQL formatted date (string: 'yyyy-mm-dd')
    if isDate(dateString):
        date = parseDate(dateString, fuzzy=True).date().isoformat()
    else:
        date = '0001-01-01'  # Default date must be compliant with mysql date format
    return date
def pollAll(blogRecList, itemRecList):
    for blogRec in blogRecList:
        if blogRec['delete'] == False:
            pollRes = poll(blogRec)
            parseRes = parseXML(pollRes)
            if parseRes['blogRec']['status'] == True:
                blogRec = parseRes['blogRec']
                firstTime = blogRec['firstTime']
                items = parseRes['items']
                for item in items:
                    if item['pubDate'] != None:
                        pDate = parseDate(item['pubDate'])
                        # compare dates to dates, not a date to a datetime
                        if firstTime or pDate.date() == datetime.today().date():
                            newItemRec = {}
                            newItemRec['title'] = item['title']
                            newItemRec['pubDate'] = item['pubDate']
                            newItemRec['link'] = item['link']
                            newItemRec['blogUrl'] = blogRec['url']
                            newItemRec['isNew'] = True
                            isExist = False
                            for itemRec in itemRecList:
                                if itemRec['link'] == newItemRec['link']:
                                    isExist = True
                                    break
                            if isExist == False:
                                itemRecList.append(newItemRec)
                if firstTime:
                    blogRec['firstTime'] = False
    print(blogRecList)
    print(itemRecList)
def get_element(element):
    text = ''.join(n.data for n in element.childNodes if n.nodeType == n.TEXT_NODE)
    try:
        entry_type = element.getAttribute('type')
        if entry_type == 'integer':
            try:
                return int(text)
            except ValueError:
                return 0
        elif entry_type in ('date', 'datetime'):
            return parseDate(text)
        elif entry_type == 'boolean':
            try:
                return text.strip().lower() in ('true', '1')
            except ValueError:
                return False
        elif entry_type == 'decimal':
            try:
                return float(text)
            except ValueError:
                return 0.0
        else:
            return text
    except:
        return text
def getDate(text):
    match = re.search(r"[a-z]{2,12}\.?\s?\d{1,2}\,\s?\d{2,4}", text.lower())
    if match:
        try:
            return parseDate(match[0])
        except ValueError:
            return ""
    return ""
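# Hedged usage sketch (not part of the original source), assuming parseDate is
# dateutil.parser.parse; the regex above only matches "<month name> <day>, <year>"
# style strings in the lowercased text:
#
#   getDate("Decided March 3, 2004")  -> datetime(2004, 3, 3, 0, 0)
#   getDate("03/03/2004")             -> ""   (no textual month, so the regex does not match)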
def parser(self, value):
    try:
        return parseDate(value)
    except:
        raise ParseError(
            message="\"{}\" is not a valid datetime".format(value)
        )
def __processFacebookDoc(self, doc):
    '''Converts a mongo document into user dictionary for matcher.
    It appends the headline to biography of the user.
    '''
    user = {}
    if doc == None:
        return None
    # Commons
    user['username'] = doc['_id']
    user['name'] = doc['name']
    user['location'] = doc['location']
    user['website'] = doc['website']
    user['bio'] = doc['bio'] + doc['site']  # 'site' is like a short biography
    if doc['photo'] != "" and doc['photo'] != None:
        user['profileImage'] = doc['photo']
    else:
        user['profileImage'] = ""
    try:
        user['matched'] = doc['matched']
    except:
        user['matched'] = None
    user['sourceCollection'] = FACEBOOK
    try:
        user['bornAt'] = parseDate(doc['birthdate'][:-10])  # remove Born and convert to date
    except:
        user['bornAt'] = None
    # Specials
    try:
        user['friends'] = list(map(lambda f: f[25:], doc['friends']))
    except:
        user['friends'] = []
    if doc['bg'] != "" and doc['bg'] != None:
        user['backgroundImage'] = doc['bg']
    else:
        user['backgroundImage'] = ""
    user['education'] = ""
    if ('education' in doc and len(doc['education']) > 0):
        user['education'] += doc['education'][0] + "\n"
    if ('education1' in doc and len(doc['education1']) > 0):
        user['education'] += doc['education1'][0] + "\n"
    if ('education2' in doc and len(doc['education2']) > 0):
        user['education'] += doc['education2'][0] + "\n"
    user['education'] = user['education'].strip()
    user['work'] = ""
    if ('work' in doc and len(doc['work']) > 0):
        user['work'] += doc['work'][0] + "\n"
    if ('work1' in doc and len(doc['work1']) > 0):
        user['work'] += doc['work1'][0] + "\n"
    if ('work2' in doc and len(doc['work2']) > 0):
        user['work'] += doc['work2'][0] + "\n"
    user['work'] = user['work'].strip()
    return user
def appointment(author_id):
    user = getById(author_id)
    if (user["CurrentFunction"][0:3] != "app"):
        user["CurrentFunction"] = "app0"
        sendMessage("Would you like to make an appointment?", author_id)
        print(database)
    elif (user["CurrentFunction"][0:3] == "app"):
        print(database)
        if (int(user["CurrentFunction"][3:]) == 0):
            print(database)
            for x in getAllSubstrings(message.upper(), 2):
                response = ""
                if x in ["YES"]:
                    response = "YES"
                    break
                elif x in ["NO"]:
                    response = "NO"
                    break
            if response.upper() == "YES":
                user["CurrentFunction"] = "app1"
                sendMessage("Please enter the date and time of your appointment", author_id)
            elif response.upper() == "NO":
                user["CurrentFunction"] = ""
                sendMessage("Ok then!", author_id)
                print(database)
            else:
                sendMessage(answers["afirmativeQuestion"][language], author_id)
        elif (int(user["CurrentFunction"][3:]) == 1):
            global currentApp
            currentApp += 1
            user["CurrentFunction"] = "app" + str(currentApp)
            appDate = parseDate(message)
            appDateString = appDate.strftime("%Y-%m-%d %H:%M:%S")
            appointment = {
                "uid": author_id,
                "appId": currentApp,
                "date": appDateString,
                "hospital": ""
            }
            appointments.append(appointment)
            sendMessage("Where will the appointment take place?", author_id)
            print(appointments)
        elif (int(user["CurrentFunction"][3:]) > 1):
            appId = int(user["CurrentFunction"][3:])
            user["CurrentFunction"] = ""
            appointment = getAppById(appId)
            appointment["hospital"] = message
            sendMessage("Your appointment has been made on " + appointment["date"] +
                        " for " + appointment["hospital"], author_id)
            print(appointment)
def startparse(xml):
    '''create parse tree and parse the list of starttimes [of the hourly time
    segments in the forecast]. see sample xml forecast at:
    http://forecast.weather.gov/MapClick.php?lat=40.357439&lon=-74.64922&FcstType=digitalDWML
    '''
    tree = etree.parse(BytesIO(xml))
    starttimes = [
        parseDate(starttime)
        for starttime in tree.xpath('data/time-layout/start-valid-time/text()')
    ]
    return tree, starttimes
def __update_vol(self, vol, account):
    v, created = EbsVolume.objects.get_or_create(id=vol.id, account=account, size=vol.size)
    if vol.snapshot_id:
        v.snapshot = self.get_snapshot(vol.snapshot_id)
    v.region = vol.zone
    v.timestamp = parseDate(vol.create_time)
    v.state = vol.status
    v.save()
    self.vols.append(v.id)
    return v
def __update_snap(self, snap, account=None):
    nsnap, created = SnapShot.objects.get_or_create(id=snap.id, size=snap.volume_size)
    nsnap.account = account
    nsnap.owner_id = snap.owner_id
    nsnap.description = nsnap.description or ''
    nsnap.state = snap.status
    nsnap.timestamp = parseDate(snap.start_time)
    if account:
        nsnap.our = True
    nsnap.save()
    self.snaps.append(nsnap.id)
    return nsnap
def __processTwitterDoc(self, doc):
    '''Converts a mongo document into user dictionary for matcher.
    '''
    user = {}
    if doc == None:
        return None
    # Commons
    user['username'] = doc['_id']
    user['name'] = doc['name']
    user['location'] = doc['location']
    user['website'] = doc['site']
    user['bio'] = doc['bio']
    if doc['photo'] != "" and doc['photo'] != None:
        user['profileImage'] = doc['photo']
    else:
        user['profileImage'] = ""
    try:
        user['matched'] = doc['matched']
    except:
        user['matched'] = None
    user['sourceCollection'] = TWITTER
    try:
        user['bornAt'] = parseDate(doc['born'][5:])  # removes "Born " string
    except:
        user['bornAt'] = None
    # Specials
    user['username'] = doc['_id']
    user['tweets'] = doc['tweets']
    user['followers'] = doc['followerids']
    if doc['joined'] != "":
        user['joinedAt'] = parseDate(doc['joined'][7:])
    else:
        user['joinedAt'] = None
    return user
def _readRssFeed(self):
    filepath = mergePath(self.get_plugin_isntall_path()["path"], "rss.xml")
    if not os.path.exists(filepath):
        return []
    with open(filepath, "rb") as f:
        # file is opened in binary mode, so join the byte lines with a bytes separator
        xmldoc = parseString(b"\r\n".join(f.readlines()))
    itemlist = xmldoc.getElementsByTagName("item")
    items = []
    for item in itemlist:
        itemdate = parseDate(
            get_xml_text(item.getElementsByTagName("pubDate")[0])).replace(tzinfo=None)
        rssitem = {
            "title": get_xml_text(item.getElementsByTagName("title")[0]),
            "link": get_xml_text(item.getElementsByTagName("link")[0]),
            "guid": get_xml_text(item.getElementsByTagName("guid")[0]),
            "pubDate": itemdate,
        }
        items.append(rssitem)
    return items
def get_timetable(url):
    r = session_requests.get(url, headers=get_page_headers)
    doc = fromstring(r.text)
    # print etree.tostring(doc, pretty_print=True)
    table = doc.get_element_by_id('MemberTimetable')
    gymbox_classes = []
    tomorrow_classes = []
    tomorrow = datetime.now().date() + timedelta(days=1)
    current_day = None
    # print etree.tostring(table, pretty_print=True)
    for child in list(table):
        if child.get('class') == 'dayHeader':
            current_day_string = child.text_content()
            prefix = '  '
            if current_day_string.startswith(prefix):
                current_day_string = current_day_string[len(prefix):]
            current_day = parseDate(
                current_day_string.encode('ascii', 'ignore').decode('ascii')).date()
        if current_day == tomorrow:
            tomorrow_classes.append(child)
    if len(tomorrow_classes) > 0:
        # Remove the dayHeader row and the column titles row
        del tomorrow_classes[0]
        del tomorrow_classes[0]
        for child in list(tomorrow_classes):
            time = child.find_class('col0Item')[0].text_content()
            class_name = child.find_class('col1Item')[0].text_content()
            instructor = child.find_class('col3Item')[0].text_content()
            duration = child.find_class('col4Item')[0].text_content()
            id = child.find_class('col5Item')[0][0].get('id')[5:]
            gymbox_classes.append(
                GymboxClass(id, class_name, time, instructor, duration))
    return gymbox_classes
def _readRssFeed(self):
    filepath = mergePath(self.get_plugin_isntall_path()['path'], 'rss.xml')
    if not os.path.exists(filepath):
        return []
    with open(filepath, 'rb') as f:
        # file is opened in binary mode, so join the byte lines with a bytes separator
        xmldoc = parseString(b"\r\n".join(f.readlines()))
    itemlist = xmldoc.getElementsByTagName('item')
    items = []
    for item in itemlist:
        itemdate = parseDate(
            get_xml_text(item.getElementsByTagName('pubDate')[0])).replace(tzinfo=None)
        rssitem = {
            "title": get_xml_text(item.getElementsByTagName('title')[0]),
            "link": get_xml_text(item.getElementsByTagName('link')[0]),
            "guid": get_xml_text(item.getElementsByTagName('guid')[0]),
            "pubDate": itemdate
        }
        items.append(rssitem)
    return items
def updateReservation(self, res):
    account = AwsAccount.objects.get(id=res.owner_id)
    reg = res.region.name
    groups = []
    instances = []
    for g in res.groups:
        groups.append(SecurityGroup.objects.get(name=g.id, account=account))
    for inst in res.instances:
        ami = AMI.objects.get(id=inst.image_id)
        dt = parseDate(inst.launch_time)
        key = KeyPair.objects.get(name=inst.key_name, account=account)
        ain, created = Instance.objects.get_or_create(id=inst.id, account=account,
                                                      ami=ami, key_pair=key)
        ain.timestamp = dt
        ain.region = reg
        ain.state = inst.state
        ain.type = inst.instance_type
        ain.private_dns_name = inst.private_dns_name
        ain.public_dns_name = inst.public_dns_name
        ain.ip_address = inst.ip_address or ''
        ain.monitored = inst.monitored
        try:
            ebsblock = inst.block_device_mapping[inst.root_device_name]
            ain.volume = EbsVolume.objects.get(id=ebsblock.volume_id)
            ain.size = ain.volume.size
            ain.persistent = not ebsblock.delete_on_termination
        except:
            pass
        ain.security_groups.clear()
        for g in groups:
            ain.security_groups.add(g)
        ain.save()
        self.instances.append(ain.id)
def getLogLinesAsRssItems(self, repositoryId, maxlines):
    """Returns an RSS <item> representation of the log lines."""
    # print "Getting loglines for", repositoryId, maxlines
    buffer = ''
    lines = self._getUniqueLogLines(repositoryId, maxlines)
    # lines = self._tail(repositoryId, maxlines)
    if lines:
        burl, prfx = None, None
        for line in reversed(lines):
            lineparts = line.split(' ', 2)
            # Get baseUrl and metadataPrefix from the meta part only once.
            # Beware: we'll assume the latest warning has the most recent (and probably correct) repository settings.
            if burl is None:
                burl, prfx = self._getMetaPartStuff(lineparts[1])
            oai_id = lineparts[1].split(':', 1)[1]
            rssData = {
                'title': xmlEscape(oai_id),
                'description': xmlEscape(lineparts[2]),
                'identifier': xmlEscape(lineparts[1]),
                'date': xmlEscape(str((parseDate(lineparts[0], ignoretz=True)).date())),
                'link': xmlEscape(('%s?verb=GetRecord&identifier=%s&metadataPrefix=%s') % (burl, oai_id, prfx))
            }
            buffer += str(RSS_TEMPLATE % rssData)
    return buffer
def isDate(string):
    try:
        parseDate(string, fuzzy=True).date().isoformat()
        return True
    except:
        return False
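# Hedged usage sketch (not part of the original source), assuming parseDate is
# dateutil.parser.parse; fuzzy=True lets a date embedded in longer text pass:
#
#   isDate('Decided March 3, 2004')  -> True
#   isDate('no date here')           -> False  (parseDate raises, so False)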
def process_feed_item(feed_item, source, articles_in_memory, db):
    # check for duplicate in db
    if db.articles.find_one({'url': feed_item.link}) is not None:
        logger.debug('Skip: article already exists')
        return False

    # check if link exists already in memory
    if any(a['url'] == feed_item.link for a in articles_in_memory):
        logger.debug('Skip: article already exists')
        return False

    # parse article
    try:
        article = Article(feed_item.link, config=create_newspaper_config())
        article.download()
        article.parse()
    except newspaper.article.ArticleException as exc:
        logger.debug(f'Newspaper error: {exc}')
        # logger.exception(exc)
        return False

    # check title
    article_title = article.title.strip()
    if not article_title:
        logger.debug('Skip: no title or text')
        return False

    # check text
    article_text = article.text.strip()
    if len(article_text) < MIN_TEXT_LENGTH:
        logger.debug('Skip: text too short')
        return False

    # must have date
    published_at_val = None
    if article.publish_date:
        # get from parsed article
        published_at_val = article.publish_date
    elif hasattr(feed_item, 'published'):
        # get from feed item
        published_at_val = feed_item.published
    if not published_at_val:
        logger.debug('Skip: missing date')
        return False

    # normalize date, create datetime object, remove time zone
    if isinstance(published_at_val, datetime):
        published_at = published_at_val.replace(tzinfo=None)
    elif isinstance(published_at_val, str):
        try:
            published_at = parseDate(published_at_val, ignoretz=True)
        except ParserError as exc:
            logger.debug(f'Dateutil parse error: {exc}')
            return False
    else:
        logger.debug('No valid date found')
        return False

    # date must be within last n days
    difference = datetime.now() - published_at
    if difference.days > config.KEEP_DAYS:
        logger.debug(
            f'Skip: Article older than {config.KEEP_DAYS} days ({published_at})'
        )
        return False

    # create new item
    return {
        'title': article_title,
        'published_at': published_at,
        'created_at': datetime.now(),
        'url': feed_item.link,
        'src': source['id'],
        'text': article_text
    }
def overview_json():
    # for head in request.headers:
    #     print head, request.headers.get(head)
    auth = request.headers.get("Authorization")
    if not auth:
        return authenticate({
            'code': 'authorization_header_missing',
            'description': 'Authorization header is expected'
        })

    parts = auth.split()
    if parts[0].lower() != 'bearer':
        return {
            'code': 'invalid_header',
            'description': 'Authorization header must start with Bearer'
        }
    elif len(parts) == 1:
        return {'code': 'invalid_header', 'description': 'Token not found'}
    elif len(parts) > 2:
        return {
            'code': 'invalid_header',
            'description': 'Authorization header must be Bearer + \s + token'
        }

    token = parts[1]
    try:
        company_id = "1"
        conn = sqlite3.connect('seaborg_god.db')
        conn.row_factory = dict_factory
        c = conn.cursor()
        c.execute("SELECT * FROM companies WHERE id LIKE ?", (company_id,))
        company_info = c.fetchone()
        payload = jwt.decode(
            token, company_info['jwt_secret']
            # audience=client_id
        )
    except jwt.ExpiredSignature:
        return authenticate({
            'code': 'token_expired',
            'description': 'token is expired'
        })
    except jwt.InvalidAudienceError:
        return authenticate({
            'code': 'invalid_audience',
            'description': 'incorrect audience, expected: ' + client_id
        })
    except jwt.DecodeError:
        return authenticate({
            'code': 'token_invalid_signature',
            'description': 'token signature is invalid'
        })

    date_sent = parseDate(request.query.date_today)
    # print date_sent
    # print request.query.cid
    # print request.forms.get('cid')
    # print request.json
    # print "decoded", payload
    # print "loading task id", item

    conn = sqlite3.connect('seaborg_god.db')
    conn.row_factory = dict_factory
    c = conn.cursor()
    c.execute(
        """SELECT i.name
                 ,i.id
                 ,i.outline
                 ,i.creation_date
                 ,i.approval_date
                 ,i.rejection_date
                 ,i.completion_date
                 ,i.proposal_date
                 ,i.created_by
                 ,i.approved_by
                 ,i.rejected_by
                 ,i.completed_by
                 ,i.proposed_by
                 ,i.responsible
                 ,i.reporting_to
                 ,i.reporting_cycle
                 ,i.progress_report_id
                 ,i.final_report_id
                 ,i.budget_id
                 ,i.department_owner
                 ,i.last_save
                 ,i.deadline_date
                 ,i.approval_requested
                 ,e1.name AS created_name
                 ,e2.name AS approved_name
                 ,e3.name AS rejected_name
                 ,e4.name AS completed_name
                 ,e5.name AS proposed_name
                 ,e6.name AS responsible_name
                 ,e7.name AS reporting_to_name
                 ,departments.title AS department_title
                 ,departments.department_head AS department_head_id
                 ,departments.parent_department AS parent_department_id
                 ,depHeadPerson.name AS department_approval_by
           FROM tasks i
           LEFT JOIN people e1 ON e1.id = i.created_by
           LEFT JOIN people e2 ON e2.id = i.approved_by
           LEFT JOIN people e3 ON e3.id = i.rejected_by
           LEFT JOIN people e4 ON e4.id = i.completed_by
           LEFT JOIN people e5 ON e5.id = i.proposed_by
           LEFT JOIN people e6 ON e6.id = i.responsible
           LEFT JOIN people e7 ON e7.id = i.reporting_to
           LEFT JOIN departments ON departments.id = i.department_owner
           LEFT JOIN people depHeadPerson ON departments.department_head = depHeadPerson.id
           WHERE i.company_id LIKE ?
        """, (str(payload['company_id']), ))
    tasks = c.fetchall()

    c.execute(
        "SELECT axioms.* FROM axioms, tasks WHERE axioms.task_id = tasks.id AND tasks.company_id LIKE ?",
        (str(payload['company_id']), ))
    axioms = c.fetchall()
    for axiom in axioms:
        if axiom['task_dependence_id']:
            print("the task has a dependence")
            c.execute("SELECT name, deadline_date FROM tasks WHERE id = ?",
                      (axiom['task_dependence_id'], ))
            axiom_task = c.fetchone()
            axiom['dependence'] = {
                "name": axiom_task['name'],
                "id": axiom['task_dependence_id'],
                "deadline_date": axiom_task['deadline_date']
            }

    c.execute(
        "SELECT goals.* FROM goals, tasks WHERE goals.task_id = tasks.id AND tasks.company_id LIKE ?",
        (str(payload['company_id']), ))
    goals = c.fetchall()
    c.execute(
        "SELECT deliverables.* FROM deliverables, tasks WHERE deliverables.task_id = tasks.id AND tasks.company_id LIKE ?",
        (str(payload['company_id']), ))
    deliverables = c.fetchall()
    c.execute(
        "SELECT objectives.* FROM objectives, tasks WHERE objectives.task_id = tasks.id AND tasks.company_id LIKE ?",
        (str(payload['company_id']), ))
    objectives = c.fetchall()

    for task in tasks:
        objs = [obj for obj in axioms if obj['task_id'] == task['id']]
        task['axioms'] = objs
        objs = [obj for obj in goals if obj['task_id'] == task['id']]
        task['goals'] = objs
        objs = [obj for obj in deliverables if obj['task_id'] == task['id']]
        for obj in objs:
            if obj['isdelivered'] == 1:
                obj['isdelivered'] = True
            else:
                obj['isdelivered'] = False
        task['deliverables'] = objs
        objs = [obj for obj in objectives if obj['task_id'] == task['id']]
        task['objectives'] = objs
        if task['name'] == "":
            task['name'] = "Untitled task"
        task['isCompleted'] = task['completion_date'] is not None and task['completion_date'] != ""
        task['isApproved'] = task['approval_date'] is not None and task['approval_date'] != ""
        task['isRejected'] = task['rejection_date'] is not None and task['rejection_date'] != ""
        task['isPendingApproval'] = task['approval_requested'] is not None and task['approval_requested'] != ""
        task['isCreator'] = payload['id'] == task['created_by']
        task['isApprover'] = payload['id'] == task['department_head_id']
        task['isAssignee'] = payload['id'] == task['responsible']
        task['isRaportingOfficer'] = payload['id'] == task['reporting_to']
        task['deadlineDue'] = (parseDate(task['deadline_date']) -
                               parseDate(request.query.date_today)).days
        # Task state
        if task['isCompleted']:
            task['state'] = 'isCompleted'
        elif task['isApproved']:
            task['state'] = 'isApproved'
        elif task['isRejected']:
            task['state'] = 'isRejected'
        elif task['isPendingApproval']:
            task['state'] = 'isPendingApproval'
        else:
            task['state'] = 'isDraft'

    c.execute(
        "SELECT company_name, owner as company_owner_id, url_name as company_url FROM companies WHERE id = ?",
        (str(payload['company_id']), ))
    company_info = c.fetchone()
    conn.close()

    overview = {"tasks": tasks, "company": company_info}
    return overview
def run(self):
    self.train_meta_data = TransactionMetadata()
    self.train_meta_data.setFromDict(self.transaction.persistent_model_metadata.train_metadata)

    header = self.transaction.input_data.columns
    origData = {}
    for column in header:
        origData[column] = []
    empty_count = {}
    column_count = {}

    # we dont need to generate statistic over all of the data, so we subsample, based on our accepted margin of error
    population_size = len(self.transaction.input_data.data_array)
    sample_size = int(sampleSize(population_size=population_size,
                                 margin_error=CONFIG.DEFAULT_MARGIN_OF_ERROR,
                                 confidence_level=CONFIG.DEFAULT_CONFIDENCE_LEVEL))

    # get the indexes of randomly selected rows given the population size
    input_data_sample_indexes = random.sample(range(population_size), sample_size)
    self.logging.info('population_size={population_size}, sample_size={sample_size} {percent:.2f}%'.format(
        population_size=population_size, sample_size=sample_size,
        percent=(sample_size / population_size) * 100))

    for sample_i in input_data_sample_indexes:
        row = self.transaction.input_data.data_array[sample_i]
        for i, val in enumerate(row):
            column = header[i]
            value = tryCastToNumber(val)
            if not column in empty_count:
                empty_count[column] = 0
                column_count[column] = 0
            if value == None:
                empty_count[column] += 1
            else:
                origData[column].append(value)
            column_count[column] += 1

    stats = {}
    for i, col_name in enumerate(origData):
        col_data = origData[col_name]  # all rows in just one column
        data_type = self.getColumnDataType(col_data)

        # NOTE: Enable this if you want to assume that some numeric values can be text
        # We noticed that by default this should not be the behavior
        # TODO: Evaluate if we want to specify the problem type on predict statement as regression or classification
        #
        # if col_name in self.train_meta_data.model_predict_columns and data_type == DATA_TYPES.NUMERIC:
        #     unique_count = len(set(col_data))
        #     if unique_count <= CONFIG.ASSUME_NUMERIC_AS_TEXT_WHEN_UNIQUES_IS_LESS_THAN:
        #         data_type = DATA_TYPES.TEXT

        if data_type == DATA_TYPES.DATE:
            for i, element in enumerate(col_data):
                if str(element) in [str(''), str(None), str(False), str(np.nan), 'NaN', 'nan', 'NA']:
                    col_data[i] = None
                else:
                    try:
                        col_data[i] = int(parseDate(element).timestamp())
                    except:
                        logging.warning('Could not convert string to date and it was expected, current value {value}'.format(value=element))
                        col_data[i] = None

        if data_type == DATA_TYPES.NUMERIC or data_type == DATA_TYPES.DATE:
            newData = []
            for value in col_data:
                if value != '' and value != '\r' and value != '\n':
                    newData.append(value)
            col_data = [cleanfloat(i) for i in newData
                        if str(i) not in ['', str(None), str(False), str(np.nan), 'NaN', 'nan', 'NA']]

            y, x = np.histogram(col_data, 50, density=False)
            x = (x + np.roll(x, -1))[:-1] / 2.0
            x = x.tolist()
            y = y.tolist()

            xp = []
            if len(col_data) > 0:
                max_value = max(col_data)
                min_value = min(col_data)
                mean = np.mean(col_data)
                median = np.median(col_data)
                var = np.var(col_data)
                skew = st.skew(col_data)
                kurtosis = st.kurtosis(col_data)

                inc_rate = 0.05
                initial_step_size = abs(max_value - min_value) / 100

                xp += [min_value]
                i = min_value + initial_step_size
                while i < max_value:
                    xp += [i]
                    i_inc = abs(i - min_value) * inc_rate
                    i = i + i_inc

                # TODO: Solve inc_rate for N
                # min*inc_rate + (min+min*inc_rate)*inc_rate + (min+(min+min*inc_rate)*inc_rate)*inc_rate ...
                #
                # x_0 = 0
                # x_i = (min + x_(i-1)) * inc_rate = min*inc_rate + x_(i-1)*inc_rate
                #
                # sum_(i=1)^n (x_i) = max_value = inc_rate * (n*min + sum(x_(i-1)))
                # max_value/inc_rate = n*min + inc_rate * (n*min + sum(x_(i-2)))
                # max_value = n*min*inc_rate + inc_rate^2*n*min + inc_rate^2*sum(x_(i-2))
                #           = n*min*(inc_rate + inc_rate^2) + inc_rate^2*sum(x_(i-2))
                #           = n*min*(inc_rate + inc_rate^2) + inc_rate^2*(inc_rate*(n*min + sum(x_(i-3))))
                #           = n*min*(sum_(i=1)^(i=n) inc_rate^i)
                # => sum_(i=1)^(i=n) inc_rate^i = max_value / (n*min)
            else:
                max_value = 0
                min_value = 0
                mean = 0
                median = 0
                var = 0
                skew = 0
                kurtosis = 0
                xp = []

            is_float = True if max([1 if int(i) != i else 0 for i in col_data]) == 1 else False

            col_stats = {
                "column": col_name,
                KEYS.DATA_TYPE: data_type,
                # "distribution": best_fit_name,
                # "distributionParams": distribution_params,
                "mean": mean,
                "median": median,
                "variance": var,
                "skewness": skew,
                "kurtosis": kurtosis,
                "emptyColumns": empty_count[col_name],
                "emptyPercentage": empty_count[col_name] / column_count[col_name] * 100,
                "max": max_value,
                "min": min_value,
                "is_float": is_float,
                "histogram": {
                    "x": x,
                    "y": y
                },
                "percentage_buckets": xp
            }
            stats[col_name] = col_stats

        # else if it's text
        else:
            # see if it's a sentence or a word
            is_full_text = True if data_type == DATA_TYPES.FULL_TEXT else False
            dictionary, histogram = self.getWordsDictionary(col_data, is_full_text)

            # if no words, then no dictionary
            if len(col_data) == 0:
                dictionary_available = False
                dictionary_lenght_percentage = 0
                dictionary = []
            else:
                dictionary_available = True
                dictionary_lenght_percentage = len(dictionary) / len(col_data) * 100
                # if the number of uniques is too large then treat it as text
                if dictionary_lenght_percentage > 10 and len(col_data) > 50 and is_full_text == False:
                    dictionary = []
                    dictionary_available = False

            col_stats = {
                "column": col_name,
                KEYS.DATA_TYPE: DATA_TYPES.FULL_TEXT if is_full_text else data_type,
                "dictionary": dictionary,
                "dictionaryAvailable": dictionary_available,
                "dictionaryLenghtPercentage": dictionary_lenght_percentage,
                "emptyColumns": empty_count[col_name],
                "emptyPercentage": empty_count[col_name] / column_count[col_name] * 100,
                "histogram": histogram
            }
            stats[col_name] = col_stats

    total_rows = len(self.transaction.input_data.data_array)
    test_rows = len(self.transaction.input_data.test_indexes)
    validation_rows = len(self.transaction.input_data.validation_indexes)
    train_rows = len(self.transaction.input_data.train_indexes)

    self.transaction.persistent_model_metadata.column_stats = stats
    self.transaction.persistent_model_metadata.total_row_count = total_rows
    self.transaction.persistent_model_metadata.test_row_count = test_rows
    self.transaction.persistent_model_metadata.train_row_count = train_rows
    self.transaction.persistent_model_metadata.validation_row_count = validation_rows
    self.transaction.persistent_model_metadata.update()

    return stats
def _getTopItem(self, lxmlNode):
    ## Wrappers:
    pid, modified, mimetype, pidlocation = '', '', "application/xml", ''

    #1: Get persistentIdentifier:
    pidlist = lxmlNode.xpath(
        '//didl:DIDL/didl:Item/didl:Descriptor/didl:Statement/dii:Identifier/text()',
        namespaces=self._nsMap)
    if len(pidlist) > 0:
        pid = pidlist[0].strip()
        if not comm.isURNNBN(pid):
            raise ValidateException(
                formatExceptionLine(EXCEPTION0 + pid, prefix=STR_DIDL))
    else:
        raise ValidateException(
            formatExceptionLine(EXCEPTION1, prefix=STR_DIDL))

    #2: Get toplevel modificationDate: comm.isISO8601()
    tl_modified = lxmlNode.xpath(
        '//didl:DIDL/didl:Item/didl:Descriptor/didl:Statement/dcterms:modified/text()',
        namespaces=self._nsMap)
    ## Check validity/presence of tlModified, otherwise raise an exception:
    if len(tl_modified) > 0 and not comm.isISO8601(tl_modified[0]):
        raise ValidateException(
            formatExceptionLine(EXCEPTION2 + tl_modified[0], prefix=STR_DIDL))
    elif len(tl_modified) == 0:
        raise ValidateException(
            formatExceptionLine(EXCEPTION3, prefix=STR_DIDL))

    ## Get all modified dates:
    all_modified = lxmlNode.xpath(
        '//didl:Item/didl:Descriptor/didl:Statement/dcterms:modified/text()',
        namespaces=self._nsMap)

    ## Get most recent date from all items, to add to toplevelItem:
    if len(all_modified) > 0:
        datedict = {}
        for date in all_modified:
            if comm.isISO8601(date.strip()):
                #datedict[parseDate(date.strip())] = date.strip()
                pd = parseDate(date.strip())
                datedict["%s %s" % (str(pd.date()), str(pd.time()))] = date.strip()
        ## Get first sorted key:
        for key in reversed(sorted(datedict.keys())):
            modified = datedict[key]
            break
        if not tl_modified[0].strip() == modified:
            self.do.logMsg(self._uploadid, LOGGER1, prefix=STR_DIDL)

    #3: Get PidResourceMimetype
    mimetypelist = lxmlNode.xpath(
        '//didl:DIDL/didl:Item/didl:Component/didl:Resource/@mimeType',
        namespaces=self._nsMap)
    if len(mimetypelist) > 0:
        mimetype = mimetypelist[0].strip()
        if not comm.isMimeType(mimetype):
            self.do.logMsg(self._uploadid, LOGGER2 + mimetype, prefix=STR_DIDL)

    #4: Get PidResourceLocation:
    pidlocation = self._findAndBindFirst(
        lxmlNode,
        '%s',
        '//didl:DIDL/didl:Item/didl:Component/didl:Resource/@ref',
        '//didl:DIDL/didl:Item/didl:Component/didl:Resource/text()',
        '//didl:Item/didl:Item[didl:Descriptor/didl:Statement/rdf:type/@rdf:resource="info:eu-repo/semantics/humanStartPage"]/didl:Component/didl:Resource/@ref',  #DIDL 3.0
        '//didl:Item/didl:Item[didl:Descriptor/didl:Statement/rdf:type/@resource="info:eu-repo/semantics/humanStartPage"]/didl:Component/didl:Resource/@ref',  #DIDL 3.0, without @rdf:resource
        '//didl:Item/didl:Item[didl:Descriptor/didl:Statement/dip:ObjectType/text()="info:eu-repo/semantics/humanStartPage"]/didl:Component/didl:Resource/@ref',  #fallback DIDL 2.3.1
        '//didl:Item/didl:Item[didl:Descriptor/didl:Statement/rdf:type/@rdf:resource="info:eu-repo/semantics/objectFile"]/didl:Component/didl:Resource/@ref',  #fallback DIDL 3.0
        '//didl:Item/didl:Item[didl:Descriptor/didl:Statement/rdf:type/@resource="info:eu-repo/semantics/objectFile"]/didl:Component/didl:Resource/@ref',  #fallback DIDL 3.0, without @rdf:resource
        '//didl:Item/didl:Item[didl:Descriptor/didl:Statement/dip:ObjectType/text()="info:eu-repo/semantics/objectFile"]/didl:Component/didl:Resource/@ref'  #fallback DIDL 2.3.1
    ).strip()

    if pidlocation == '':
        raise ValidateException(
            formatExceptionLine(EXCEPTION4, prefix=STR_DIDL))
    if not comm.isURL(pidlocation):
        raise ValidateException(
            formatExceptionLine(EXCEPTION5 + pidlocation, prefix=STR_DIDL))

    return """<didl:Item>
    <didl:Descriptor><didl:Statement mimeType="application/xml"><dii:Identifier>%s</dii:Identifier></didl:Statement></didl:Descriptor>
    <didl:Descriptor><didl:Statement mimeType="application/xml"><dcterms:modified>%s</dcterms:modified></didl:Statement></didl:Descriptor>
    <didl:Component><didl:Resource mimeType="%s" ref="%s"/></didl:Component>""" % (
        escapeXml(pid), modified, escapeXml(mimetype), comm.urlQuote(pidlocation))
def run(self):
    header = self.transaction.input_data.columns
    origData = {}
    for column in header:
        origData[column] = []
    empty_count = {}
    column_count = {}

    # we dont need to generate statistic over all of the data, so we subsample, based on our accepted margin of error
    population_size = len(self.transaction.input_data.data_array)
    sample_size = int(sampleSize(population_size=population_size,
                                 margin_error=CONFIG.DEFAULT_MARGIN_OF_ERROR,
                                 confidence_level=CONFIG.DEFAULT_CONFIDENCE_LEVEL))

    # get the indexes of randomly selected rows given the population size
    input_data_sample_indexes = random.sample(range(population_size), sample_size)
    self.logging.info('population_size={population_size}, sample_size={sample_size} {percent:.2f}%'.format(
        population_size=population_size, sample_size=sample_size,
        percent=(sample_size / population_size) * 100))

    for sample_i in input_data_sample_indexes:
        row = self.transaction.input_data.data_array[sample_i]
        for i, val in enumerate(row):
            column = header[i]
            value = self.cast(val)
            if not column in empty_count:
                empty_count[column] = 0
                column_count[column] = 0
            if value == None:
                empty_count[column] += 1
            else:
                origData[column].append(value)
            column_count[column] += 1

    stats = {}
    for i, col_name in enumerate(origData):
        col_data = origData[col_name]  # all rows in just one column
        data_type = self.getColumnDataType(col_data)

        if data_type == DATA_TYPES.DATE:
            for i, element in enumerate(col_data):
                if str(element) in [str(''), str(None), str(False), str(np.nan), 'NaN', 'nan', 'NA']:
                    col_data[i] = None
                else:
                    try:
                        col_data[i] = int(parseDate(element).timestamp())
                    except:
                        logging.warning('Could not convert string to date and it was expected, current value {value}'.format(value=element))
                        col_data[i] = None

        if data_type == DATA_TYPES.NUMERIC or data_type == DATA_TYPES.DATE:
            newData = []
            for value in col_data:
                if value != '' and value != '\r' and value != '\n':
                    newData.append(value)
            col_data = [float(i) for i in newData
                        if str(i) not in ['', str(None), str(False), str(np.nan), 'NaN', 'nan', 'NA']]

            y, x = np.histogram(col_data, 50, density=False)
            x = (x + np.roll(x, -1))[:-1] / 2.0
            x = x.tolist()
            y = y.tolist()

            if len(col_data) > 0:
                max_value = max(col_data)
                min_value = min(col_data)
                mean = np.mean(col_data)
                median = np.median(col_data)
                var = np.var(col_data)
                skew = st.skew(col_data)
                kurtosis = st.kurtosis(col_data)
            else:
                max_value = 0
                min_value = 0
                mean = 0
                median = 0
                var = 0
                skew = 0
                kurtosis = 0

            col_stats = {
                "column": col_name,
                KEYS.DATA_TYPE: data_type,
                # "distribution": best_fit_name,
                # "distributionParams": distribution_params,
                "mean": mean,
                "median": median,
                "variance": var,
                "skewness": skew,
                "kurtosis": kurtosis,
                "emptyColumns": empty_count[col_name],
                "emptyPercentage": empty_count[col_name] / column_count[col_name] * 100,
                "max": max_value,
                "min": min_value,
                "histogram": {
                    "x": x,
                    "y": y
                }
            }
            stats[col_name] = col_stats

        # else if it's text
        else:
            # see if it's a sentence or a word
            is_full_text = True if data_type == DATA_TYPES.FULL_TEXT else False
            dictionary, histogram = self.getWordsDictionary(col_data, is_full_text)

            # if no words, then no dictionary
            if len(col_data) == 0:
                dictionary_available = False
                dictionary_lenght_percentage = 0
                dictionary = []
            else:
                dictionary_available = True
                dictionary_lenght_percentage = len(dictionary) / len(col_data) * 100
                # if the number of uniques is too large then treat it as text
                if dictionary_lenght_percentage > 10 and len(col_data) > 50 and is_full_text == False:
                    dictionary = []
                    dictionary_available = False

            col_stats = {
                "column": col_name,
                KEYS.DATA_TYPE: DATA_TYPES.FULL_TEXT if is_full_text else data_type,
                "dictionary": dictionary,
                "dictionaryAvailable": dictionary_available,
                "dictionaryLenghtPercentage": dictionary_lenght_percentage,
                "emptyColumns": empty_count[col_name],
                "emptyPercentage": empty_count[col_name] / column_count[col_name] * 100,
                "histogram": histogram
            }
            stats[col_name] = col_stats

    total_rows = len(self.transaction.input_data.data_array)
    test_rows = len(self.transaction.input_data.test_indexes)
    validation_rows = len(self.transaction.input_data.validation_indexes)
    train_rows = len(self.transaction.input_data.train_indexes)

    self.transaction.persistent_model_metadata.column_stats = stats
    self.transaction.persistent_model_metadata.total_row_count = total_rows
    self.transaction.persistent_model_metadata.test_row_count = test_rows
    self.transaction.persistent_model_metadata.train_row_count = train_rows
    self.transaction.persistent_model_metadata.validation_row_count = validation_rows
    self.transaction.persistent_model_metadata.update()

    return stats
def isISO8601(datestring):
    try:
        parseDate(datestring)
    except ValueError:
        return False
    return True
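# Hedged usage sketch (not part of the original source), assuming parseDate is
# dateutil.parser.parse. Note the check is looser than its name suggests: anything
# dateutil can parse passes, not only ISO 8601 strings:
#
#   isISO8601('2004-03-03')    -> True
#   isISO8601('July 4, 2000')  -> True   (not ISO 8601, but parseable)
#   isISO8601('not a date')    -> False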
def main():
    env = getGHEnv()
    gh = env["GITHUB"]
    inpV = env["INPUT"]
    e = json.loads(gh["EVENT_PATH"].read_text())
    #pprint(e)
    i = e["issue"]
    id = i["id"]
    no = i["number"]
    b = i["body"]
    l = i["locked"]
    c = parseDate(i["created_at"])
    up = parseDate(i["updated_at"])
    u = i["user"]
    r = e["repository"]
    rn = r["name"]
    ro = r["owner"]
    rol = ro["login"]
    lblz = {lbl["name"] for lbl in i["labels"]}
    #print(e["action"], "c", c, "up", up, u["login"], i["state"], lblz)

    ksyStub, otherMetadata = parseHeaders(b)
    parser = YAML(typ="safe")
    illF = " contains ill-formed YAML"
    if ksyStub:
        try:
            ksyStub = parser.load(ksyStub)
        except:
            ksyStub = None
            ksyStubIssues = ["KSY stub" + illF]
        if ksyStub:
            ksyStubIssues = lintKSYStub(ksyStub)
    else:
        ksyStubIssues = [
            "KSY stub (`meta` + `doc` + `doc-ref` must be present) is missing"
        ]

    if otherMetadata:
        try:
            otherMetadata = parser.load(otherMetadata)
        except:
            otherMetadata = None
            additionalBlockIssues = ["Additional block" + illF]
        additionalBlockIssues = lintAdditionalBlock(otherMetadata)
    else:
        additionalBlockIssues = ()

    api = GHAPI(inpV["GITHUB_TOKEN"])
    repO = api.repo(rol, rn)
    issueO = repO.issue(no)

    if ksyStubIssues or additionalBlockIssues:
        lblzMustBe = (lblz | {invalidLabel}) - {validLabel}
        if invalidLabel not in lblz:
            issueO.leaveAComment(
                generateIssuesMessage(
                    "Hi. Thank you for leaving the request. Please, fix the following issues in it:",
                    ksyStubIssues, additionalBlockIssues))
        else:
            # todo: parse the issues and diff them
            issueO.leaveAComment(
                generateIssuesMessage("Some issues are still present:",
                                      ksyStubIssues, additionalBlockIssues))
    else:
        lblzMustBe = (lblz | {validLabel}) - {invalidLabel}
        if invalidLabel in lblz or validLabel not in lblz:
            print("commenting")
            issueO.leaveAComment(
                "The issues that are detected by the linter have been fixed. Thank you."
            )
            print("commented")
        else:
            pass  # everything is OK

    if lblzMustBe != lblz:
        print("Fixing labels")
        issueO.setLabels(lblzMustBe)
        print("Fixed labels")
def toDate(dateString):
    try:
        return parseDate(dateString)
    except:
        # return epoch on failure
        return datetime.fromtimestamp(0)
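# Hedged usage sketch (not part of the original source), assuming parseDate is
# dateutil.parser.parse and datetime is datetime.datetime; the fallback is the
# Unix epoch in local time, not necessarily 1970-01-01 00:00 UTC:
#
#   toDate('2014-06-01T12:00:00')  -> datetime(2014, 6, 1, 12, 0)
#   toDate('not a date')           -> datetime.fromtimestamp(0)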
minima[dataIndex] = min(minima[dataIndex], maxOnline)
maxima[dataIndex] = max(maxima[dataIndex], maxOnline)
average[dataIndex] += maxOnline
averageCount[dataIndex] += 1
dataIndex += 1
if dataIndex == 7 * 24:
    dataIndex = 0
if dataIndex == _dataIndex:
    times = times - 1

with open(sys.argv[1], "r") as log:
    line = log.readline()[:-1].split(" ")
    timestamp = parseDate(line[0])
    lastTimestamp = timestamp
    timeBorder = roundTime(timestamp, True)
    dataIndex = timestamp.weekday() * 24 + timestamp.hour
    if line[1] != "RESET":
        print("WARN: First line is not a RESET line")
    for line in log:
        line = line[:-1].split(" ")
        timestamp = parseDate(line[0])
        if line[1] == "RESET":
            online = 0
        elif line[1] == "+":
            online += 1
def norm(value, cell_stats):
    if cell_stats[KEYS.DATA_TYPE] == DATA_TYPES.NUMERIC:
        if (str(value) in [str(''), str(' '), str(None), str(False), str(np.nan), 'NaN', 'nan', 'NA']
                or (value == None or value == '' or value == '\n' or value == '\r')):
            return [0, 0, 0]
        if cell_stats['max'] - cell_stats['min'] != 0:
            normalizedValue = (value - cell_stats['min']) / \
                (cell_stats['max'] - cell_stats['min'])
        elif cell_stats['max'] != 0:
            normalizedValue = value / cell_stats['max']
        else:
            normalizedValue = value
        # if normalizedValue > 10:
        #     raise ValueError('Something is wrong with normalized value')
        sign = 1 if normalizedValue >= 0 else 0
        normalizedValue = abs(normalizedValue) + OFFSET
        return [normalizedValue, sign, 1.0]

    if cell_stats[KEYS.DATA_TYPE] == DATA_TYPES.DATE:
        # [timestamp, year, month, day, minute, second, is null]
        if (str(value) in [str(''), str(' '), str(None), str(False), str(np.nan), 'NaN', 'nan', 'NA']
                or (value == None or value == '' or value == '\n' or value == '\r')):
            ret = [0] * 7
            ret[-1] = 0
            return ret
        try:
            timestamp = int(parseDate(value).timestamp())
        except:
            ret = [0] * 7
            ret[-1] = 0
            return ret
        date = datetime.datetime.fromtimestamp(timestamp)
        date_max = datetime.datetime.fromtimestamp(cell_stats['max'])
        date_min = datetime.datetime.fromtimestamp(cell_stats['min'])
        attrs = ['year', 'month', 'day', 'minute', 'second']
        maxes = {'day': 31, 'minute': 60, 'second': 60, 'month': 12}
        norm_vals = []
        if cell_stats['max'] - cell_stats['min'] != 0:
            norm_vals.append(
                (timestamp - cell_stats['min']) / (cell_stats['max'] - cell_stats['min']))
        else:
            norm_vals.append(timestamp / cell_stats['max'])
        for k_attr in attrs:
            curr = getattr(date, k_attr)
            if k_attr in maxes:
                d_max = maxes[k_attr]
                d_min = 0
            else:
                d_max = getattr(date_max, k_attr)
                d_min = getattr(date_min, k_attr)
            if d_max - d_min != 0:
                norm_vals.append((curr - d_min) / (d_max - d_min))
            else:
                norm_vals.append((curr) / (d_max))
        norm_vals.append(1.0)
        return norm_vals

    if cell_stats[KEYS.DATA_TYPE] == DATA_TYPES.TEXT:
        # is it a word
        if cell_stats['dictionaryAvailable']:
            # all the words in the dictionary +2 (one for rare words and one for null)
            vector_length = len(cell_stats['dictionary']) + TEXT_ENCODING_EXTRA_LENGTH
            arr = [0] * vector_length
            arr[-1] = 1.0
            if value in [None, '']:
                # return NULL value, which is an empty hot vector array with the last item in list with value 1
                arr[vector_length - 1] = 0  # set null as 1
                return arr
            # else return one hot vector
            # if word is a strange word it will not be in the dictionary
            try:
                index = cell_stats['dictionary'].index(value)
            except:
                index = vector_length - 2
            arr[index] = 1
            return arr
        else:
            return []

    if cell_stats[KEYS.DATA_TYPE] == DATA_TYPES.FULL_TEXT:
        if (str(value) in [str(''), str(' '), str(None), str(False), str(np.nan), 'NaN', 'nan', 'NA']
                or (value == None or value == '' or value == '\n' or value == '\r')):
            return [FULL_TEXT_NONE_VALUE]
        # is it a full text
        if cell_stats['dictionaryAvailable']:
            # all the words in the dictionary +2 (one for rare words and one for null)
            vector_length = len(cell_stats['dictionary']) + FULL_TEXT_ENCODING_EXTRA_LENGTH
            # else return a list of one hot vectors
            values = splitRecursive(value, WORD_SEPARATORS)
            array_of_arrays = []
            first_word = vector_length - 4
            array_of_arrays += [FULL_TEXT_IS_START]
            for word in values:
                # else return one hot vector
                # if word is a strange word it will not be in the dictionary
                try:
                    index = cell_stats['dictionary'].index(word)
                except:
                    index = FULL_TEXT_UN_FREQUENT
                array_of_arrays += [index]
            array_of_arrays += [FULL_TEXT_IS_END]
            # return [array_of_arrays]
            # TODO: ask about this
            return array_of_arrays
        else:
            return []