def contests():
    """ Show the upcoming contests """
    # .replace(microsecond=0) truncates microseconds safely; the old
    # str(today)[:-7] round-trip broke whenever microseconds were zero.
    today = datetime.datetime.today().replace(microsecond=0)
    start_date = today.date()
    end_date = start_date + datetime.timedelta(90)
    url = "https://contesttrackerapi.herokuapp.com/"
    response = requests.get(url)
    if response.status_code == 200:
        response = response.json()["result"]
    else:
        # @todo: something better
        return dict()

    ongoing = response["ongoing"]
    upcoming = response["upcoming"]
    contests = []
    cal = pdt.Calendar()

    table = TABLE(_class="centered striped")
    thead = THEAD(TR(TH("Contest Name"),
                     TH("Site"),
                     TH("Start"),
                     TH("Duration/Ending"),
                     TH("Link"),
                     TH("Add Reminder")))
    table.append(thead)
    tbody = TBODY()

    for i in ongoing:
        if i["Platform"] in ("TOPCODER", "OTHER"):
            continue
        try:
            endtime = datetime.datetime.strptime(i["EndTime"],
                                                 "%a, %d %b %Y %H:%M")
        except ValueError:
            continue
        tr = TR()
        span = SPAN(_class="green tooltipped",
                    data={"position": "right",
                          "delay": "50",
                          "tooltip": "Live Contest"},
                    _style="cursor: pointer; "
                           "float: right; "
                           "height: 10px; "
                           "width: 10px; "
                           "border-radius: 50%;")
        tr.append(TD(i["Name"], span))
        tr.append(TD(i["Platform"].capitalize()))
        tr.append(TD("-"))
        tr.append(TD(str(endtime).replace("-", "/"),
                     _class="contest-end-time"))
        tr.append(TD(A(I(_class="fa fa-external-link-square fa-lg"),
                       _class="btn-floating btn-small green accent-4 tooltipped",
                       _href=i["url"],
                       data={"position": "left",
                             "tooltip": "Contest Link",
                             "delay": "50"},
                       _target="_blank")))
        tr.append(TD(BUTTON(I(_class="fa fa-calendar-plus-o"),
                            _class="btn-floating btn-small orange accent-4 tooltipped disabled",
                            data={"position": "left",
                                  "tooltip": "Already started!",
                                  "delay": "50"})))
        tbody.append(tr)

    # This id is used for uniquely identifying
    # a particular contest in js
    button_id = 1
    for i in upcoming:
        if i["Platform"] in ("TOPCODER", "OTHER"):
            continue
        start_time = datetime.datetime.strptime(i["StartTime"],
                                                "%a, %d %b %Y %H:%M")
        tr = TR(_id="contest-" + str(button_id))
        tr.append(TD(i["Name"]))
        tr.append(TD(i["Platform"].capitalize()))
        tr.append(TD(str(start_time)))
        duration = i["Duration"]
        duration = duration.replace(" days", "d")
        duration = duration.replace(" day", "d")
        tr.append(TD(duration))
        tr.append(TD(A(I(_class="fa fa-external-link-square fa-lg"),
                       _class="btn-floating btn-small green accent-4 tooltipped",
                       _href=i["url"],
                       data={"position": "left",
                             "tooltip": "Contest Link",
                             "delay": "50"},
                       _target="_blank")))
        tr.append(TD(BUTTON(I(_class="fa fa-calendar-plus-o"),
                            _class="btn-floating btn-small orange accent-4 tooltipped",
                            data={"position": "left",
                                  "tooltip": "Set Reminder to Google Calendar",
                                  "delay": "50"},
                            _id="set-reminder-" + str(button_id))))
        tbody.append(tr)
        button_id += 1

    table.append(tbody)
    return dict(table=table, upcoming=upcoming)
def __init__(self):
    self._db = DatabaseManager()
    self._pdt = parsedatetime.Calendar(parsedatetime.Constants(self.LOCALE))
def conv_dt(dt_string):
    cal = parsedatetime.Calendar()
    dt_obj, _ = cal.parseDT(datetimeString=dt_string)
    return dt_obj
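# Usage sketch for conv_dt above (illustrative, not from the original
# source): Calendar.parseDT resolves relative phrases against the current
# time and returns a (datetime, status) pair; conv_dt discards the status,
# so unparseable input silently comes back as "now".
import parsedatetime

cal = parsedatetime.Calendar()
dt, status = cal.parseDT(datetimeString="tomorrow at 5pm")
print(dt)      # e.g. 2024-06-12 17:00:00, relative to the current day
print(status)  # 3 here, meaning both a date and a time were recognized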
def process_message(config, data, event, context):
    logger = logging.getLogger('pubsub2inbox')

    # Ignore messages submitted before our retry period
    retry_period = '2 days ago'
    if 'retryPeriod' in config:
        retry_period = config['retryPeriod']
    retry_period_parsed = parsedatetime.Calendar().parse(retry_period)
    if len(retry_period_parsed) > 1:
        retry_earliest = datetime.fromtimestamp(mktime(retry_period_parsed[0]),
                                                timezone.utc)
    else:
        retry_earliest = datetime.fromtimestamp(mktime(retry_period_parsed),
                                                timezone.utc)
    message_time = parser.parse(context.timestamp)
    if (message_time - retry_earliest) < timedelta(0, 0):
        logger.warning('Ignoring message because it\'s past the retry period.',
                       extra={
                           'event_id': context.event_id,
                           'retry_period': retry_period,
                           'retry_earliest': retry_earliest.strftime('%c'),
                           'event_timestamp': message_time
                       })
        raise MessageTooOldException(
            'Ignoring message because it\'s past the retry period.')

    template_variables = {
        'data': data,
        'event': event,
        'context': context,
    }

    jinja_environment = get_jinja_environment()
    if 'processors' in config:
        for processor in config['processors']:
            logger.debug('Processing message using input processor: %s' %
                         processor)
            mod = __import__('processors.%s' % processor)
            processor_module = getattr(mod, processor)
            processor_class = getattr(processor_module,
                                      '%sProcessor' % processor.capitalize())
            processor_instance = processor_class(config, jinja_environment,
                                                 data, event, context)
            processor_variables = processor_instance.process()
            template_variables.update(processor_variables)
            jinja_environment.globals = {
                **jinja_environment.globals,
                **template_variables
            }

    if 'processIf' in config:
        processif_template = jinja_environment.from_string(config['processIf'])
        processif_template.name = 'processif'
        processif_contents = processif_template.render()
        if processif_contents.strip() == '':
            logger.info(
                'Will not send message because processIf evaluated to empty.')
            return

    if 'resendBucket' in config:
        if 'resendPeriod' not in config:
            raise NoResendConfigException(
                'No resendPeriod configured, even though resendBucket is set!')

        resend_key_hash = hashlib.sha256()
        if 'resendKey' not in config:
            default_resend_key = template_variables.copy()
            default_resend_key.pop('context')
            resend_key_hash.update(
                json.dumps(default_resend_key).encode('utf-8'))
        else:
            key_template = jinja_environment.from_string(config['resendKey'])
            key_template.name = 'resend'
            key_contents = key_template.render()
            resend_key_hash.update(key_contents.encode('utf-8'))

        resend_file = resend_key_hash.hexdigest()
        logger.debug('Checking for resend object in bucket...',
                     extra={
                         'bucket': config['resendBucket'],
                         'blob': resend_file
                     })
        client_info = grpc_client_info.ClientInfo(
            user_agent='google-pso-tool/pubsub2inbox/1.1.0')
        storage_client = storage.Client(client_info=client_info)
        bucket = storage_client.bucket(config['resendBucket'])
        resend_blob = bucket.blob(resend_file)
        if resend_blob.exists():
            resend_blob.reload()
            resend_period = config['resendPeriod']
            resend_period_parsed = parsedatetime.Calendar().parse(
                resend_period, sourceTime=resend_blob.time_created)
            if len(resend_period_parsed) > 1:
                resend_earliest = datetime.fromtimestamp(
                    mktime(resend_period_parsed[0]))
            else:
                resend_earliest = datetime.fromtimestamp(
                    mktime(resend_period_parsed))

            if datetime.now() >= resend_earliest:
                logger.debug('Resending the message now.',
                             extra={
                                 'resend_earliest': resend_earliest,
                                 'blob_time_created': resend_blob.time_created
                             })
                resend_blob.upload_from_string('')
            else:
                logger.info(
                    'Can\'t resend the message now, resend period not elapsed.',
                    extra={
                        'resend_earliest': resend_earliest,
                        'blob_time_created': resend_blob.time_created
                    })
                return
        else:
            try:
                resend_blob.upload_from_string('', if_generation_match=0)
            except Exception as exc:
                # Handle TOCTOU condition
                if 'conditionNotMet' in str(exc):
                    logger.warning(
                        'Message (re)sending already in progress (resend key already exists).',
                        extra={'exception': exc})
                    return
                else:
                    raise exc
            return

    if 'outputs' in config:
        for output_config in config['outputs']:
            if 'type' not in output_config:
                raise NoTypeConfiguredException(
                    'No type configured for output!')

            if 'processIf' in output_config:
                processif_template = jinja_environment.from_string(
                    output_config['processIf'])
                processif_template.name = 'processif'
                processif_contents = processif_template.render()
                if processif_contents.strip() == '':
                    logger.info(
                        'Will not use output processor %s because processIf evaluated to empty.'
                        % output_config['type'])
                    continue

            logger.debug('Processing message using output processor: %s' %
                         output_config['type'])
            output_type = output_config['type']
            mod = __import__('output.%s' % output_type)
            output_module = getattr(mod, output_type)
            output_class = getattr(output_module,
                                   '%sOutput' % output_type.capitalize())
            output_instance = output_class(config, output_config,
                                           jinja_environment, data, event,
                                           context)
            try:
                output_instance.output()
            except Exception as exc:
                logger.error('Output processor %s failed, trying next...' %
                             output_type,
                             extra={'exception': traceback.format_exc()})
                if 'allOutputsMustSucceed' in config and config[
                        'allOutputsMustSucceed']:
                    raise exc
    else:
        raise NoOutputsConfiguredException('No outputs configured!')
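# Small illustrative sketch of the retry-period parsing used in
# process_message above (assuming parsedatetime, time.mktime, and
# datetime are imported as in that module): the relative phrase is
# resolved against "now", so '2 days ago' yields a point in the past.
import parsedatetime
from time import mktime
from datetime import datetime

parsed = parsedatetime.Calendar().parse('2 days ago')
retry_earliest = datetime.fromtimestamp(mktime(parsed[0]))
print(retry_earliest)  # roughly the current time minus 48 hours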
def __init__(self, date_format=None, **kwargs):
    super(Date, self).__init__(**kwargs)
    self.date_format = date_format
    self.parser = parsedatetime.Calendar(
        version=parsedatetime.VERSION_CONTEXT_STYLE)
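# Illustrative sketch (not part of the class above) of what
# VERSION_CONTEXT_STYLE changes: the second value returned by parse()
# becomes a pdtContext object instead of the legacy integer flag, so
# callers can ask what the parser actually recognized.
import parsedatetime

cal = parsedatetime.Calendar(version=parsedatetime.VERSION_CONTEXT_STYLE)
time_struct, ctx = cal.parse("tomorrow at noon")
print(ctx.hasDate, ctx.hasTime)  # True True when both parts were parsed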
def __call__(self, args):
    try:
        os.makedirs(args.directory)
    except:
        pass

    since = None
    if args.since:
        since = parsedatetime.Calendar().parse(args.since)
        since = time.mktime(since[0])
        since = datetime.datetime.fromtimestamp(since)

    if args.labels:
        font = ImageFont.truetype("arial.ttf", 14)
    else:
        font = None

    workers = session.query(turkic.models.Worker)
    for worker in workers:
        print "Sampling worker {0}".format(worker.id)
        jobs = session.query(Job)
        jobs = jobs.filter(Job.worker == worker)
        jobs = jobs.join(Segment)
        jobs = jobs.join(Video)
        jobs = jobs.filter(Video.isfortraining == False)
        if since:
            jobs = jobs.filter(turkic.models.HIT.timeonserver >= since)
        jobs = jobs.order_by(sqlalchemy.func.rand())
        jobs = jobs.limit(args.number)

        for job in jobs:
            print "Visualizing HIT {0}".format(job.hitid)
            paths = [x.getboxes(interpolate=True, bind=True, label=True)
                     for x in job.paths]

            if args.frames > job.segment.stop - job.segment.start:
                frames = range(job.segment.start, job.segment.stop + 1)
            else:
                frames = random.sample(xrange(job.segment.start,
                                              job.segment.stop + 1),
                                       args.frames)

            size = math.sqrt(len(frames))
            video = job.segment.video
            bannersize = (video.width * int(math.floor(size)),
                          video.height * int(math.ceil(size)))
            image = Image.new(video[0].mode, bannersize)
            size = int(math.floor(size))
            offset = (0, 0)
            horcount = 0
            paths = vision.visualize.highlight_paths(video, paths, font=font)
            for frame, framenum in paths:
                if framenum in frames:
                    image.paste(frame, offset)
                    horcount += 1
                    if horcount >= size:
                        offset = (0, offset[1] + video.height)
                        horcount = 0
                    else:
                        offset = (offset[0] + video.width, offset[1])
            image.save("{0}/{1}-{2}.jpg".format(args.directory, worker.id,
                                                job.hitid))
# limitations under the License.
"""For processing BigQuery timestamp dates."""

import datetime
import logging
import re

import parsedatetime

# Turn off the chatty parsedatetime module's logging
logging.getLogger('parsedatetime').setLevel(logging.ERROR)

# Initialize parsedatetime
pdt_constants = parsedatetime.Constants()
pdt_constants.BirthdayEpoch = 50  # TODO(user) provide a way to set this.
pdt = parsedatetime.Calendar(pdt_constants)

# e.g. 1989-10-02 05:23:48  1958-06-24T12:18:35.5803  1988-08-15T19:06:56.235
TIMESTAMP_RE = re.compile(r'^\d{4}-(0[1-9]|1[0-2])-(0[1-9]|[12]\d|3[01])'
                          r'[ T]([01]\d|2[0-3]):[0-5]\d'
                          r':([0-5]\d|60)(\.\d{3,4})?'  # Leap seconds!
                          r'( [+-][012]\d:[0-5]\d)?$')

# YYYY-MM-DD HH:MM:SS.micro +08:00
OUTPUT_TIMESTAMP_FORMAT = '%Y-%m-%d %H:%M:%S.%f %z'

INPUT_TIMESTAMP_FORMATS = (
    '%Y-%m-%d %H:%M:%S.%f %z',
    '%Y/%m/%d %H:%M:%S.%f %z',
    '%m/%d/%Y %H:%M:%S.%f %z',
    '%m/%d/%y %H:%M:%S.%f %z',
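# A quick sanity sketch (hypothetical, not from the original module)
# exercising TIMESTAMP_RE against the example values quoted in the
# comment above; all three should match.
for sample in ('1989-10-02 05:23:48',
               '1958-06-24T12:18:35.5803',
               '1988-08-15T19:06:56.235'):
    print(sample, bool(TIMESTAMP_RE.match(sample)))  # True for each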
async def mute(self, ctx, *, member = None, cooldown = None):
    """Prevents a member from sending messages in chat or speaking in a voice channel (bot-admin/admin only)."""
    # isOwner = self.settings.isOwner(ctx.author)
    # if isOwner == False:
    #     await ctx.send("This feature is still being tested")
    #     return
    if not await Utils.is_bot_admin_reply(ctx):
        return
    if member is None:
        em = discord.Embed(color=0XFF8C00,
                           description="Prevents a member from sending messages in chat or speaking in a voice channel\n\n"
                                       "**Usage**\n"
                                       "`{}mute [member] [cooldown]`".format(ctx.prefix))
        em.set_footer(text="{}#{}".format(ctx.author.name, ctx.author.discriminator),
                      icon_url=f"{ctx.author.avatar_url}")
        return await ctx.send(embed=em)

    # Let's search for a name at the beginning - and a time at the end
    parts = member.split()
    for j in range(len(parts)):
        # Reverse search direction
        i = len(parts) - 1 - j
        memFromName = None
        endTime = None
        # Name = 0 up to i joined by space
        nameStr = ' '.join(parts[0:i + 1])
        # Time = end of name -> end of parts joined by space
        timeStr = ' '.join(parts[i + 1:])
        memFromName = DisplayName.memberForName(nameStr, ctx.guild)
        if memFromName:
            # We got a member - let's check for time
            try:
                # Get current time - and end time
                currentTime = int(time.time())
                cal = parsedatetime.Calendar()
                time_struct, parse_status = cal.parse(timeStr)
                start = datetime(*time_struct[:6])
                end = time.mktime(start.timetuple())
                # Get the time from now to end time
                endTime = end - currentTime
            except:
                pass
            if endTime is not None:
                # We got a member and a time - break
                break

    if memFromName is None:
        # We couldn't find one or the other
        em = discord.Embed(color=0XFF8C00,
                           description="> Prevents a member from sending messages in chat or speaking in a voice channel\n> \n"
                                       "> **Usage**\n"
                                       "> `{}mute [member] [cooldown]`".format(ctx.prefix))
        em.set_author(name="Oops!",
                      url="https://acinonyxesports.com/",
                      icon_url="https://cdn.discordapp.com/attachments/518118753226063887/725569194304733435/photo.jpg")
        em.set_footer(text=f"Request By : {ctx.author.name}",
                      icon_url=f"{ctx.author.avatar_url}")
        return await ctx.send(embed=em)

    cooldown = None if endTime == 0 else endTime
    member = memFromName

    # Check if we're muting ourself
    if member is ctx.author:
        msg = 'It would be easier for me if you just stayed quiet yourself!'
        em = discord.Embed(color=0XFF8C00, description=msg)
        em.set_footer(text="{}".format(ctx.author), icon_url=f"{ctx.author.avatar_url}")
        return await ctx.send(embed=em)

    # Check if we're muting the bot
    if member.id == self.bot.user.id:
        msg = '┐( ̄ヘ ̄;)┌\nI won\'t mute myself.'
        em = discord.Embed(color=0XFF8C00, description=msg)
        em.set_footer(text="{}".format(ctx.author), icon_url=f"{ctx.author.avatar_url}")
        return await ctx.send(embed=em)

    # Check if member is admin or bot admin
    if await Utils.is_bot_admin_reply(ctx, member=member,
                                      message="┐( ̄ヘ ̄;)┌\nYou can't mute another admin.",
                                      message_when=True):
        return

    # Set cooldown - or clear it
    if type(cooldown) is int or type(cooldown) is float:
        if cooldown < 0:
            msg = '┐( ̄ヘ ̄;)┌\nThe cooldown can\'t be a negative number!'
            em = discord.Embed(color=0XFF8C00, description=msg)
            em.set_footer(text="{}".format(ctx.author), icon_url=f"{ctx.author.avatar_url}")
            return await ctx.send(embed=em)
        currentTime = int(time.time())
        cooldownFinal = currentTime + cooldown
    else:
        cooldownFinal = None

    # Check if we're using the old mute and suggest the quicker version
    try:
        role = ctx.guild.get_role(int(self.settings.getServerStat(ctx.guild, "MuteRole")))
    except:
        role = None
    msg = "Muting...{}".format(
        "" if role else "\n\nYou can pick a mute role by typing `{}setmuterole [role]`,\n"
                        "or `{}createmuterole [role_name]`, to mute members ***faster***.".format(ctx.prefix, ctx.prefix))
    em = discord.Embed(color=0XFF8C00, description=msg)
    em.set_footer(text="{}".format(ctx.author), icon_url="{}".format(ctx.author.avatar_url))
    mess = await ctx.send(embed=em)

    # Do the actual muting
    await self._mute(member, ctx.guild, cooldownFinal, ctx.author)
    if cooldown:
        checkRead = ReadableTime.getReadableTimeBetween(currentTime, cooldownFinal)
        msg = '*{}* has been **Muted** for *{}*.'.format(DisplayName.name(member), checkRead)
        # pm = 'You have been **Muted** by *{}* for *{}*.\n\nYou will not be able to send messages on *{}* until either that time has passed, or you have been **Unmuted**.'.format(DisplayName.name(ctx.author), checkRead, Utils.suppressed(ctx, ctx.guild.name))
    else:
        msg = '*{}* has been **Muted** *until further notice*.'.format(DisplayName.name(member))
        # pm = 'You have been **Muted** by *{}* *until further notice*.\n\nYou will not be able to send messages on *{}* until you have been **Unmuted**.'.format(DisplayName.name(ctx.author), Utils.suppressed(ctx, ctx.guild.name))
    await mess.edit(content=Utils.suppressed(ctx, msg))
def run():
    from bs4 import BeautifulSoup
    import requests
    from main_site.models import NewsItem
    import parsedatetime
    from datetime import datetime

    print("-" * 100)
    print("Hackernews parser")
    print("-" * 100)

    urls = [
        "https://news.ycombinator.com/news?p=1",
        "https://news.ycombinator.com/news?p=2",
        "https://news.ycombinator.com/news?p=3"
    ]

    tr1s = []
    tr2s = []
    for u in urls:
        res = requests.get(u)
        html = res.content
        soup = BeautifulSoup(html, 'html.parser')
        tr1s_tmp = soup.findAll('tr', {'class': 'athing'})
        tr1s += tr1s_tmp
        for tr1 in tr1s_tmp:
            tr2s.append(tr1.findNextSibling())

    url = []
    title = []
    hacker_news_url = []
    upvote_count = []
    comment_count = []
    posted_on = []
    for i in range(len(tr1s)):
        url.append(tr1s[i].select_one('td:nth-of-type(3) > a').get('href'))
        title.append(tr1s[i].select_one('td:nth-of-type(3) > a').get_text())

        if tr1s and tr2s[i].select_one('td:nth-of-type(2) > span.age > a') is None:
            hacker_news_url.append('')
        else:
            hacker_news_url.append(
                "https://news.ycombinator.com/" +
                tr2s[i].select_one('td:nth-of-type(2) > span.age > a').get('href'))

        if tr2s[i].select_one('td:nth-of-type(2) > span.score') is None:
            upvote_count.append('0')
        else:
            upvote_count.append(tr2s[i].select_one(
                'td:nth-of-type(2) > span.score').get_text().split()[0])

        # Exception handling for the comment count cell
        if tr2s[i].select_one('td:nth-of-type(2) > a:nth-of-type(3)') is None:
            comment_count.append('0')
        elif tr2s[i].select_one('td:nth-of-type(2) > a:nth-of-type(3)').get_text() == "discuss":
            comment_count.append('0')
        else:
            comment_count.append(tr2s[i].select_one(
                'td:nth-of-type(2) > a:nth-of-type(3)').get_text().split()[0])

        if tr2s[i].select_one('td:nth-of-type(2) > span.age > a') is None:
            posted_on.append('')
        else:
            posted_on.append(tr2s[i].select_one(
                'td:nth-of-type(2) > span.age > a').get_text())

    # Convert relative ages (e.g. '7 hours ago') to absolute timestamps
    cal = parsedatetime.Calendar()
    date_db = []
    for date_str in posted_on:
        time_struct, parse_status = cal.parse(date_str)
        res = datetime(*time_struct[:6])
        date_db.append(res.strftime('%Y-%m-%d %H:%M:%S'))

    print(posted_on)
    print(date_db)

    for i in range(len(url)):
        item, created = NewsItem.objects.get_or_create(url=url[i])
        if created:
            print('New item was created')
        else:
            print('updating current item')
        item.title = title[i]
        item.hacker_news_url = hacker_news_url[i]
        item.posted_on = date_db[i]
        item.comment_count = comment_count[i]
        item.upvote_count = upvote_count[i]
        item.save()

    print(url)
    print(title)
    print(hacker_news_url)
    print(upvote_count)
    print(comment_count)
    print(date_db)
def parse_date(dtstring):
    if dtstring.strip():
        time_struct, parse_status = dtparser.Calendar().parse(dtstring)
        return datetime.fromtimestamp(mktime(time_struct))
    else:
        return ""
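# Usage sketch for parse_date above (assuming the snippet's imports:
# `import parsedatetime as dtparser`, `from time import mktime`, and
# `from datetime import datetime`): mktime converts the parsed
# struct_time to epoch seconds, which fromtimestamp turns back into a
# naive local datetime. Blank input returns "" rather than a datetime.
print(parse_date("next friday"))  # e.g. a datetime on the coming Friday
print(parse_date("   "))          # ""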
def validate_content(file, content_dir):
    """
    Validate content for a file based on the rules listed in __doc__

    :file: (str) the name of the content file to validate
    :content_dir: (str) the path to the content directory
    """
    with open(file, 'r') as f:
        file_contents = [
            content.replace('\n', '') for content in f.readlines()
        ]

    contents = {}
    for line in file_contents:
        items = line.split(':')
        field, value = cleanse(items[0]), cleanse(':'.join(items[1:]))
        contents[field] = value

    status_code = 0
    filename = file
    field = ""
    short_message = ""
    long_message = "n/a"

    loop = True
    while loop:
        loop = False

        # validate filetype
        field = ''
        filetype = os.path.splitext(file)[-1]
        if not filetype == '.md':
            status_code = 1
            short_message = "invalid filetype"
            long_message = f"File type `{filetype}` is not supported. Please ensure that your contribution is written in a Markdown file (`.md`)."
            break

        # validate name
        field = 'name'
        if not 'name' in contents:
            status_code = 1
            short_message = "missing required field `name`"
            break
        if contents['name'] == '':
            status_code = 1
            short_message = "empty required field `name`"
            break
        if not str(contents['name']):
            status_code = 1
            short_message = "invalid field `name`"
            break
        if not len(contents['name']) <= 100:
            status_code = 1
            short_message = "length of field `name` exceeds 100 characters"
            break

        # validate author
        field = 'author'
        if not 'author' in contents:
            status_code = 1
            short_message = "missing required field `author`"
            break
        if contents['author'] == '':
            status_code = 1
            short_message = "empty required field `author`"
            break
        if not str(contents['author']):
            status_code = 1
            short_message = "invalid field `author`"
            break
        if not len(contents['author']) <= 100:
            status_code = 1
            short_message = "length of field `author` exceeds 100 characters"
            break

        # validate author_github
        field = 'author_github'
        author_github = contents.get('author_github', '')
        if not author_github == '':
            if not str(author_github):
                status_code = 1
                short_message = "invalid field `author_github`"
                break
            if not 'github.com' in author_github:
                status_code = 1
                short_message = "field `author_github` must be a GitHub URL"
                break
            try:
                response = requests.get(author_github, timeout=15)
                if not response:
                    status_code = 1
                    short_message = "URL provided for field `author_github` returned an error HTTP code when accessed"
                    long_message = f"The result of accessing \"{contents['author_github']}\" resulted in an HTTP response code of `{response.status_code}`, which is an error."
                    break
            except Exception:
                status_code = 1
                short_message = "URL provided for field `author_github` is inaccessible"
                long_message = f"Trying to access \"{contents['author_github']}\" resulted in an unknown exception, likely indicating that the URL is invalid."
                break

        # validate blurb
        field = 'blurb'
        if not 'blurb' in contents:
            status_code = 1
            short_message = "missing required field `blurb`"
            break
        if contents['blurb'] == '':
            status_code = 1
            short_message = "empty required field `blurb`"
            break
        if not str(contents['blurb']):
            status_code = 1
            short_message = "invalid field `blurb`"
            break
        if not len(contents['blurb']) <= 100:
            status_code = 1
            short_message = "length of field `blurb` exceeds 100 characters"
            break

        # validate description
        field = 'description'
        if not 'description' in contents:
            status_code = 1
            short_message = "missing required field `description`"
            break
        if contents['description'] == '':
            status_code = 1
            short_message = "empty required field `description`"
            break
        if not str(contents['description']):
            status_code = 1
            short_message = "invalid field `description`"
            break
        if not len(contents['description']) <= 1000:
            status_code = 1
            short_message = "length of field `description` exceeds 1000 characters"
            break

        # validate url
        field = 'url'
        if not 'url' in contents:
            status_code = 1
            short_message = "missing required field `url`"
            break
        if contents['url'] == '':
            status_code = 1
            short_message = "empty required field `url`"
            break
        if not str(contents['url']):
            status_code = 1
            short_message = "invalid field `url`"
            break
        if 'github.com' in contents['url']:
            status_code = 1
            short_message = "URL provided for field `url` is a GitHub repository"
            long_message = f"It appears the URL for this contribution ({contents['url']}) is a GitHub repository, not a published web app. Unfortunately, Soliloquy is not a portfolio for source code. Please refer to the FAQs for more information: https://www.soliloquy.dev/about/"
            break
        try:
            response = requests.get(contents['url'], timeout=15)
            if not response:
                status_code = 1
                short_message = "URL provided for field `url` returned an error"
                long_message = f"The result of accessing \"{contents['url']}\" resulted in an HTTP response code of `{response.status_code}`, which is an error."
                break
        except Exception:
            status_code = 1
            short_message = "URL provided for field `url` is inaccessible"
            long_message = f"Trying to access \"{contents['url']}\" resulted in an unknown exception, likely indicating that the URL is invalid."
            break

        # validate img
        field = 'img'
        img = contents.get('img', '')
        if not img == '':
            reserved_file_names = ['about', 'default', 'willcarhartportfolio']
            supported_filetypes = ['.png', '.jpg', '.jpeg', '.gif']
            image_filename, filetype = os.path.splitext(img)
            if not filetype in supported_filetypes:
                status_code = 1
                short_message = f"value of field `img` file type not supported (`{filetype}`)"
                long_message = "Supported image file types are `.png`, `.jpg`, `.jpeg`, and `.gif`."
                break
            if image_filename in reserved_file_names:
                status_code = 1
                short_message = f"filename provided for field `img` is prohibited (`{image_filename}`)"
                long_message = "Some filenames are reserved for the system, as they are used elsewhere in Soliloquy's assets and thus are prohibited for use in names of contribution images. These filenames are `about`, `default`, and `willcarhartportfolio`, extension agnostic."
                break
            if not os.path.isfile(f'{content_dir}/app_img/{img}'):
                status_code = 1
                short_message = f"filename provided for field `img` not found, no such file `{os.path.basename(content_dir)}/app_img/{img}`"
                long_message = f"Make sure to add your image file to `{os.path.basename(content_dir)}/app_img/`, as this is where Soliloquy will look for it."
                break

        # validate timestamp
        field = 'timestamp'
        if not 'timestamp' in contents:
            status_code = 1
            short_message = "missing required field `timestamp`"
            break
        if contents['timestamp'] == '':
            status_code = 1
            short_message = "empty required field `timestamp`"
            break
        if any(re.match(regex, contents['timestamp'])
               for regex in ['../../.*', '.*/../..', '..-..-.*', '.*-..-..']):
            status_code = 1
            short_message = "value provided for field `timestamp` is ambiguous"
            long_message = f"The timestamp you provided (`{contents['timestamp']}`) is ambiguous. This means that its value is not deterministic. For example, **\"05/06/2018\"** could be interpreted as **May 6th, 2018** or **June 5th, 2018**."
            break
        try:
            parse = parsedatetime.Calendar()
            # Parse the timestamp field itself (the original code parsed a
            # leftover loop variable here)
            time_struct, parse_status = parse.parse(contents['timestamp'])
            if not parse_status == 1:
                status_code = 1
                short_message = "invalid field `timestamp`"
                long_message = f"The provided timestamp `{contents['timestamp']}` could not be parsed. Try using the format **Month Day, Year**, like August 7th, 2019."
                break
            dt = datetime.datetime(*time_struct[:6])
            timestamp = dt.timestamp()
        except ValueError:
            status_code = 1
            short_message = "invalid field `timestamp`"
            long_message = f"The provided timestamp `{contents['timestamp']}` could not be parsed. Try using the format **Month Day, Year**, like August 7th, 2019."
            break

    return ContentError(status_code=status_code,
                        filename=os.path.basename(filename),
                        field=field,
                        short_message=short_message,
                        long_message=long_message)
def onLoad(self):
    self.cal = parsedatetime.Calendar()
def __init__(self, name, emitter=None):
    super(ScheduledSkill, self).__init__(name, emitter)
    self.timer = None
    self.calendar = pdt.Calendar()
    self.time_rules = time_rules.create(self.lang)
    self.init_format()
#!/usr/bin/env python
import time, calendar, parsedatetime as pdt, pytz, re
from datetime import datetime
from time import mktime
from pytz import timezone
from dateutil import tz

c = pdt.Constants()
c.BirthdayEpoch = 80
p = pdt.Calendar(c)

dateFormats = [
    "%d/%m/%y-%H:%M:%S",
    "%d/%m/%y",
    "%d/%m/%Y",
    "%d/%m/%y-%H:%M",
    "%d/%m/%Y-%H:%M:%S",
    "%d/%m/%y-%H:%M",
    "%H:%M:%S"
]

# Will try to match a time given in the form hh:mm:ss from the current time
def regexmatch(timestring):
    pattern = re.compile(r"^(?:(?:([0-9])*:)?([0-9]*?\d):)?([0-9]*\d)$")
    hours = mins = secs = 0
    if pattern.match(timestring):
        args = timestring.split(":")
        if len(args) == 3:
            hours = int(args[0])
            mins = int(args[1])
            secs = int(args[2])
        elif len(args) == 2:
            mins = int(args[0])
            secs = int(args[1])
        else:
            secs = int(args[0])
def __init__(self, date_format=None, **kwargs):
    super(Date, self).__init__(**kwargs)
    self.date_format = date_format
    self.parser = parsedatetime.Calendar()
def setUp(self):
    self.cal = pdt.Calendar()
    (self.yr, self.mth, self.dy, self.hr, self.mn,
     self.sec, self.wd, self.yd, self.isdst) = time.localtime()
def parse(*args, **kwargs):
    cal = parsedatetime.Calendar()
    dt, flag = cal.parse(*args, **kwargs)
    return dt
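# Sketch of how the discarded `flag` from Calendar.parse is interpreted
# with the default (non-context-style) Calendar: 0 means nothing was
# parsed, 1 a date, 2 a time, and 3 both.
import parsedatetime

cal = parsedatetime.Calendar()
for text in ("gibberish", "august 25", "10:30pm", "tomorrow 10:30pm"):
    _, flag = cal.parse(text)
    print(text, "->", flag)  # expected flags: 0, 1, 2, 3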
def __init__(self):
    self.date_constants = parsedatetime.Constants()
    self.date_calendar = parsedatetime.Calendar(self.date_constants)
def parse_other_answer_page(self, response):
    c = pdt.Constants()
    p = pdt.Calendar(c)
    f = '%Y-%m-%d %H:%M:%S'
    hxs = HtmlXPathSelector(response)
    all_answer = hxs.xpath('//ul[contains(@id,"ya-qn-answers")]/li')
    current_ans_id = response.meta['ult_ans_id']
    for single_answer in all_answer:
        item = YahooItem()
        ans_data = single_answer.xpath(
            './/div[contains(@class,"Pt-15")]/span[contains(@class, "Clr-88")]'
        ).extract()
        data_string = html2text.html2text(ans_data[0])
        data_format = p.parseDT(
            str(data_string.encode("utf8").replace("\xc2\xb7", "").strip()))
        item['date_time'] = data_format[0].strftime(f)
        item['uid'] = str(str(response.meta['quest_id']) + "." + str(current_ans_id))
        item['type'] = "answer"
        item['tags'] = "N/A"
        item['title'] = ""
        item['resolve'] = ""
        item['answers'] = 0
        item['views'] = 0
        text_to_gain = single_answer.xpath(
            './/a[contains(@class,"uname Clr-b")]').extract()
        if text_to_gain:
            h = html2text.HTML2Text()
            h.ignore_links = True
            author_string = h.handle(text_to_gain[0])
            item['author'] = str(author_string.encode('utf-8', 'ignore').strip())
        else:
            item['author'] = "anonymous"
        item['url'] = response.url
        text_to_gain = single_answer.xpath(
            './/span[contains(@class,"ya-q-full-text")][@itemprop="text"]'
        ).extract()
        if text_to_gain:
            item['text'] = html2text.html2text(text_to_gain[0]).encode('utf-8', 'ignore')
        else:
            item['text'] = ""
        text_to_gain = single_answer.xpath(
            './/div[contains(@class,"D-ib Mend-10 Clr-93")]/div[1]/div[1]'
        ).extract()
        if text_to_gain:
            item['upvotes'] = int(html2text.html2text(text_to_gain[0]))
        else:
            item['upvotes'] = 0
        current_ans_id = current_ans_id + 1
        yield item

    # Follow pagination to the next page of answers, if present
    try:
        if hxs.xpath('//div[contains(@id, "ya-qn-pagination")]'
                     '/a[contains(@class,"Clr-bl")][last()]/@href'):
            url_of_the_next_page = hxs.xpath(
                '//div[contains(@id, "ya-qn-pagination")]'
                '/a[contains(@class,"Clr-bl")][last()]/@href').extract()
            next_page_composed = "https://hk.answers.yahoo.com" + \
                url_of_the_next_page[0]
            request = scrapy.Request(next_page_composed,
                                     callback=self.parse_other_answer_page)
            request.meta['quest_id'] = response.meta['quest_id']
            request.meta['ult_ans_id'] = current_ans_id
            yield request
    except NoSuchElementException:
        pass
def process_url(self, url):
    """
    For the given url find or create an entry for each source in the
    database. If no sources found raise an exception.
    """
    h = httplib2.Http(settings.HTTPLIB2_CACHE_DIR)
    response, content = h.request(url)

    # parse content
    soup = BeautifulSoup(content,
                         convertEntities=BeautifulStoneSoup.HTML_ENTITIES)
    spans = soup.findAll('span', 'contenttype-repositoryitem summary')
    links = [span.a for span in spans]

    # Check that we found some links. This is to detect when the page changes
    # or our scraper breaks (see issue #905 for example). Checking that the
    # most recent source is not more than X weeks old might also be a good
    # idea, but could lead to lots of false positives as there is often a
    # long hiatus.
    if not len(links):
        raise NoSourcesFoundError()

    for link in links:
        href = link['href'].strip()
        name = ' '.join(link.contents).strip()

        if not Source.objects.filter(name=name).exists():
            cal = pdt.Calendar()

            # Sometimes the space is missing before the month, so insert
            # it if it appears to be missing:
            tidied_name = re.sub(r'(\d+(st|nd|rd|th))(?=[^ ])', '\\1 ', name)

            # Commas in the name confuse parsedatetime, so strip them out too:
            tidied_name = re.sub(r',', '', tidied_name)

            result = cal.parseDateText(tidied_name)
            source_date = datetime.date(*result[:3])

            # I don't trust that we can accurately create the download link
            # url with the details that we have. Instead fetch the page and
            # extract the url.
            download_response, download_content = h.request(href)
            download_soup = BeautifulSoup(
                download_content,
                convertEntities=BeautifulStoneSoup.HTML_ENTITIES)
            download_div = download_soup.find(id="archetypes-fieldname-item_files")
            if not download_div:
                warn("Failed to find the download div on {0}".format(href))
                continue
            download_url = download_div.a['href']

            # create the source entry
            Source.objects.create(
                name=name,
                url=download_url,
                date=source_date,
            )
def parse_page(self, response):
    # Time tools
    c = pdt.Constants()
    p = pdt.Calendar(c)
    f = '%Y-%m-%d %H:%M:%S'
    now = datetime.datetime.now()

    # Start scraping a single question: check the question category first
    try:
        hxs = HtmlXPathSelector(response)
        category = hxs.xpath('(//a[contains(@class,"Clr-b")])[2]').extract()
        h = html2text.HTML2Text()
        h.ignore_links = True
        category_text = h.handle(category[0])
        url_category = str(category_text).encode('utf8').strip()
    except IndexError:
        print(str(self.uid) + " Warning: this Url is no longer available...")
        url_category = "Error"

    # If the question is related to programming and design,
    # start the item creation process
    # if "程式編寫" and "設計" in url_category:
    if (True):
        # Increment id; copy it and use uid_copy in order to protect it
        # from concurrent requests
        self.uid = self.uid + 1
        uid_copy = self.uid

        # Print current uid every 100 questions
        if self.uid % 100 == 0:
            print(str(self.uid))

        # Initialize scrapy item
        item = YahooItem()

        # Read in the date field associated with the URL if info data are present
        for istance in self.url_to_scrape:
            if response.url == istance.url:
                if istance.date == "not available":
                    item['date_time'] = "not available"
                    break
                else:
                    data_format = p.parseDT(
                        str(str(istance.date).replace("\xc2\xb7", "").strip()))
                    item['date_time'] = data_format[0].strftime(f)
                    break

        item['type'] = "question"
        item['uid'] = uid_copy
        item['url'] = response.url
        item['tags'] = "N/A"
        item['views'] = 0
        item['upvotes'] = 0

        # Take title of the question
        text_to_gain = hxs.xpath('//h1').extract()
        item['title'] = (html2text.html2text(text_to_gain[0]).encode("utf8").strip())

        # Take text from the question
        full_text_answer = hxs.xpath(
            '//span[contains(@class,"ya-q-full-text Ol-n")]').extract()
        if full_text_answer:
            item['text'] = html2text.html2text(full_text_answer[0]).encode('utf-8', 'ignore')
        else:
            text_to_gain = hxs.xpath(
                '//span[contains(@class,"ya-q-text")]').extract()
            if text_to_gain:
                item['text'] = html2text.html2text(text_to_gain[0]).encode('utf-8', 'ignore')

        # Take username of the questioner
        text_to_gain = hxs.xpath(
            '//div[contains(@id,"yq-question-detail-profile-img")]'
            '/a/img/@alt').extract()
        if text_to_gain:
            try:
                h = html2text.HTML2Text()
                h.ignore_links = True
                author_string = h.handle(text_to_gain[0])
                item['author'] = author_string.encode('utf-8', 'ignore').strip()
            # Handle HTMLtoText failures
            except:
                item['author'] = "anonymous"
        else:
            item['author'] = "anonymous"

        # Read number of answers
        text_to_gain = hxs.xpath(
            '(//div[contains(@class,"Mend-10 Fz-13 Fw-n D-ib")])'
            '[2]/span[2]').extract()
        if text_to_gain:
            answers_text = str(html2text.html2text(text_to_gain[0])).strip()
            if " answers" in answers_text:
                item['answers'] = int(answers_text.replace(" answers", "").strip())
            elif " answer" in answers_text:
                item['answers'] = int(answers_text.replace(" answer", "").strip())
            else:
                item['answers'] = 0
        else:
            item['answers'] = 0

        # Check if question is closed (resolved with a best answer)
        text_to_gain = hxs.xpath(
            '//span[contains(@class,"ya-ba-title Fw-b")]/text()').extract()
        if text_to_gain:
            item['resolve'] = "True"
        else:
            item['resolve'] = "False"

        # Yield item for the question instance
        yield item

        # Taking the best answer if present
        if hxs.xpath('//div[contains(@id,"ya-best-answer")]'):
            ans_uid = 1
            item = YahooItem()
            ans_data = hxs.xpath(
                '(//div[contains(@class,"Pt-15")]/'
                'span[contains(@class, "Clr-88")])[1]').extract()
            data_string = html2text.html2text(ans_data[0]).strip()
            data_format = p.parseDT(
                str(data_string.encode("utf8").replace("\xc2\xb7", "").strip()))
            item['date_time'] = data_format[0].strftime(f)
            item['uid'] = str(str(uid_copy) + ("." + str(ans_uid)))
            item['type'] = "answer"
            item['resolve'] = "solution"
            item['tags'] = "N/A"
            item['title'] = ""
            item['answers'] = 0
            item['views'] = 0
            best_text = hxs.xpath(
                '(//span[contains(@class,"ya-q-full-text")])[1]').extract()
            item['text'] = html2text.html2text(best_text[0]).encode('utf-8', 'ignore')
            text_to_gain = hxs.xpath(
                '(//a[contains(@class,"uname Clr-b")])[1]').extract()
            if text_to_gain:
                h = html2text.HTML2Text()
                h.ignore_links = True
                author_string = h.handle(text_to_gain[0])
                item['author'] = str(author_string.encode('utf-8', 'ignore').strip())
            else:
                item['author'] = "anonymous"
            upvote_text = hxs.xpath(
                '(//div[contains(@class,"D-ib Mstart-23 count")])[1]/text()'
            ).extract()
            item['upvotes'] = int(str(html2text.html2text(upvote_text[0])).strip())
            item['url'] = response.url
            ans_uid = ans_uid + 1
            yield item
        else:
            ans_uid = 1

        # Taking all the other answers
        all_answer = hxs.xpath('//ul[contains(@id,"ya-qn-answers")]/li')
        for single_answer in all_answer:
            item = YahooItem()
            # In this case the date is always present
            ans_data = single_answer.xpath(
                './/div[contains(@class,"Pt-15")]/span[contains(@class, "Clr-88")]'
            ).extract()
            data_string = html2text.html2text(ans_data[0])
            data_format = p.parseDT(
                str(data_string.encode("utf8").replace("\xc2\xb7", "").strip()))
            item['date_time'] = data_format[0].strftime(f)
            item['uid'] = str(str(uid_copy) + ("." + str(ans_uid)))
            item['tags'] = "N/A"
            item['title'] = ""
            item['answers'] = 0
            item['views'] = 0
            item['type'] = "answer"
            item['resolve'] = ""
            text_to_gain = single_answer.xpath(
                './/a[contains(@class,"uname Clr-b")]').extract()
            if text_to_gain:
                h = html2text.HTML2Text()
                h.ignore_links = True
                author_string = h.handle(text_to_gain[0])
                item['author'] = str(author_string.encode('utf-8', 'ignore'))
            else:
                item['author'] = "anonymous"
            # Take the url of the question because answers have no URL ref
            item['url'] = response.url
            # Check if the long text version of the answer is present
            text_to_gain = single_answer.xpath(
                './/span[contains(@class,"ya-q-full-text")][@itemprop="text"]'
            ).extract()
            if text_to_gain:
                item['text'] = html2text.html2text(text_to_gain[0]).encode('utf-8', 'ignore')
            else:
                item['text'] = ""
            text_to_gain = single_answer.xpath(
                './/div[contains(@class,"D-ib Mend-10 Clr-93")]/div[1]/div[1]'
            ).extract()
            if text_to_gain:
                item['upvotes'] = int(str(html2text.html2text(text_to_gain[0])).strip())
            else:
                item['upvotes'] = 0
            ans_uid = ans_uid + 1
            yield item

        # Checking if there are more than 10 answers;
        # in that case the remaining answers are on other pages
        try:
            if hxs.xpath('//div[contains(@id, "ya-qn-pagination")]'
                         '/a[contains(@class,"Clr-bl")][last()]/@href'):
                url_of_the_next_page = hxs.xpath(
                    '//div[contains(@id, "ya-qn-pagination")]'
                    '/a[contains(@class,"Clr-bl")][last()]/@href').extract()
                next_page_composed = "https://hk.answers.yahoo.com" + \
                    url_of_the_next_page[0]
                # Go to the next page and take more urls,
                # passing uid as parameter
                request = scrapy.Request(next_page_composed,
                                         meta={'ans_id': uid_copy},
                                         callback=self.parse_other_answer_page)
                request.meta['quest_id'] = uid_copy
                request.meta['ult_ans_id'] = ans_uid
                yield request
        except NoSuchElementException:
            pass
    else:
        print(str(self.uid) + " question not available or not related")
        print(str(response.url))
def setUp(self):
    self.ptc = pdt.Constants('en_AU', usePyICU=False)
    self.cal = pdt.Calendar(self.ptc)
    (self.yr, self.mth, self.dy, self.hr, self.mn,
     self.sec, self.wd, self.yd, self.isdst) = time.localtime()
def parse_time(time_str):
    cal = parsedatetime.Calendar()
    time_struct, _ = cal.parse(time_str)
    return datetime(*time_struct[:6])
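# Usage sketch for parse_time above (assuming `from datetime import
# datetime` as in the snippet): note that when parsedatetime cannot
# parse the input it returns the source time, so an unparseable string
# silently comes back as the current datetime instead of raising.
print(parse_time("in 3 hours"))  # roughly now plus three hours
print(parse_time("not a time"))  # the current datetime (parse flag 0)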