def main():
    connect = client.rtm_connect()
    if not connect:
        print('Slack RTM Connect Error!')
        return
    print('Slack RTM Connect Success!')
    while True:
        for data in client.rtm_read():
            if data['type'] == 'message':
                if 'bot_id' not in data:
                    parse(data['text'])
        time.sleep(0.1)
def __init__(self, url, key=None, secret=None, expiration_days=0, private=False,
             content_type=None, create=True):
    from boto.s3.connection import S3Connection
    from boto.s3.key import Key
    self.url = parse(url)
    self.expiration_days = expiration_days
    self.buffer = StringIO()
    self.private = private
    self.closed = False
    self._readreq = True
    self._writereq = False
    self.content_type = content_type or mimetypes.guess_type(self.url.path)[0]
    bucket = self.url.netloc
    if bucket.endswith(".s3.amazonaws.com"):
        bucket = bucket[:-17]
    self.client = S3Connection(key, secret)
    self.name = "s3://" + bucket + self.url.path
    if create:
        self.bucket = self.client.create_bucket(bucket)
    else:
        self.bucket = self.client.get_bucket(bucket, validate=False)
    self.key = Key(self.bucket)
    self.key.key = self.url.path.lstrip("/")
    self.buffer.truncate(0)
def is_open(dpmt, course, crn):
    base = "http://my.illinois.edu"
    page = blogotubes('http://www.courses.illinois.edu')
    if not page:
        print(page)
        return -1
    url = geturl(page, 'Class Schedule')
    if not url:
        print(url)
        return -1
    page = blogotubes(base + url)
    if not page:
        print('lol', page)
        return -1
    url = geturl(page, dpmt)
    if not url:
        print(url)
        return -1
    page = blogotubes(base + url)  # Get list of courses in dpmt
    if not page:
        print(page)
        return -1
    url = geturl(page, course)
    if not url:
        print(url)
        return -1
    page = blogotubes(base + url)  # Get list of sections in course
    if not page:
        print(page)
        return -1
    result = parse(page, crn)  # Parse openness of section
    if result:
        return 1
    else:
        return 0
def migrate(path, name):
    print('----%s----' % name)
    input_f = open(path, 'r', encoding='utf-8')
    quotes = []
    prev = ''
    for line in input_f.readlines():
        text, page = parse(line, prev)
        if len(page) > 0:
            verifyPage(page, line)
            pair = dict()
            pair['text'] = text.lstrip()
            pair['page'] = page
            quotes += [pair, ]
            prev = ''
        else:
            prev = text
    input_f.close()
    if len(prev):
        # Trailing text without a page number becomes a final quote with page 0.
        pair = {'text': prev, 'page': 0}
        quotes += [pair, ]
    book = {'title': name, 'quotes': quotes}
    return book
def detect_redirect(self):
    parse = urllib.request.urlparse
    # the original url
    org_url = self.url_data
    # get an opener doing redirections
    try:
        opener = self._create_fetcher(redirect_handler=False)
        response = opener.open(self.url)
    except:
        raise UnknownHostName(self.url)
    # the new url
    new_url = parse(response.geturl())
    # detect a redirection
    new_loc = new_url.scheme + '://' + new_url.netloc
    org_loc = org_url.scheme + '://' + org_url.netloc
    self.is_redirected = not (new_loc == org_loc)
    if self.is_redirected:
        self.printer.print_debug_line('%s redirects to %s' % (org_loc, new_loc), 2)
    else:
        self.printer.print_debug_line('%s does not redirect' % (org_loc, ), 2)
    # create a response object and add it to the cache
    R = _create_response(response)
    self.cache[new_loc] = R
    self.cache[self.url] = R
    return (self.is_redirected, new_loc)
def process_response(self, r):
    def parse(item, type):
        text = item.xpath('.//td[3]/text()')[0].strip()
        context = item.xpath('.//td[@class="codeContext"]/text()')
        where = item.xpath('.//td[@class="linenumber"]/text()')[0]
        return {
            'type': type,
            'text': text,
            'context': context[0] if context else '',
            'where': where
        }
    doc = html.document_fromstring(r)
    return chain(
        (parse(item, 'Error') for item in doc.xpath('//div[@id="errors"]//tr')),
        (parse(item, 'Warning') for item in doc.xpath('//div[@id="warnings"]//tr'))
    )
def determine_course_status(subject_name, course_name, term):
    pages = {}
    # PeopleSoft stores all pages in an iframe whose src is continually modified.
    # Fetch the page inside this iframe.
    pages['container'] = fetch('https://prdrps2.ehs.ucalgary.ca/psauthent/class-search/public')
    target_content = parse('[name=TargetContent]', pages['container'])[0]
    search_form_url = urllib.parse.unquote(target_content['src'])
    # Fetch class search form.
    pages['course_search'] = fetch(search_form_url)
    course_search_url = 'https://prdrps2.ehs.ucalgary.ca/psc/saprd/' + \
        'EMPLOYEE/HRMS/c/COMMUNITY_ACCESS.CLASS_SEARCH.GBL'
    # Fetch initial set of search results.
    pages['search_results_partial'] = fetch_initial_search_results(
        course_search_url, term, subject_name, course_name, pages['course_search']
    )
    # Fetch full set of search results.
    # TODO: for classes where all results are on the first page (i.e., those with <=
    # 3 sections), do not perform this query, but instead simply use
    # search_results_partial.
    pages['search_results_full'] = fetch_full_search_results(
        course_search_url, pages['search_results_partial']
    )
    return pages['search_results_full']
def fetch_full_search_results(course_search_url, partial_search_results):
    params = {
        'ICAJAX': '1',
        'ICType': 'Panel',
        'ICElementNum': '0',
        'ICStateNum': '57',
        'ICAction': '$ICField106$hviewall$0',
        'ICXPos': '0',
        'ICYPos': '0',
        'ICFocus': '',
        'ICSaveWarningFilter': '0',
        'ICChanged': '-1',
        'ICResubmit': '0',
        'ICModalWidget': '0',
        'ICZoomGrid': '0',
        'ICZoomGridRt': '0',
        'ICModalLongClosed': '',
        'ICActionPrompt': 'false',
        'ICFind': '',
        'ICAddCount': '',
    }
    dynamic_keys = ('ICSID', 'ICStateNum')
    dynamic_params = {}
    for key in dynamic_keys:
        dynamic_params[key] = parse('form[name=win0] input[name=%s]' % key,
                                    partial_search_results)[0]['value']
    params.update(dynamic_params)
    return fetch(course_search_url, params)
def hosted_results(self, session_host=None, session_name=None, date_range=None,
                   sort=ct.SORT_TIME, order=ct.ORDER_DESC, page=1):
    """ Search hosted races results using various fields. Returns a tuple
        (results, total_results) so if you want all results you should
        request different pages (using page) until you gather all
        total_results. Each page has 25 (ct.NUM_ENTRIES) results max."""
    lowerbound = ct.NUM_ENTRIES * (page - 1) + 1
    upperbound = lowerbound + ct.NUM_ENTRIES - 1
    data = {'sort': sort, 'order': order, 'lowerbound': lowerbound,
            'upperbound': upperbound}
    if session_host is not None:
        data['sessionhost'] = session_host
    if session_name is not None:
        data['sessionname'] = session_name
    if date_range is not None:
        # Date range
        tc = lambda s: \
            time.mktime(datetime.datetime.strptime(s, "%Y-%m-%d").timetuple()) * 1000
        data['starttime_lowerbound'] = tc(date_range[0])  # multiplied by 1000
        data['starttime_upperbound'] = tc(date_range[1])
    r = self.__req(ct.URL_HOSTED_RESULTS, data=data)
    # tofile(r)
    res = parse(r)
    total_results = res['rowcount']
    results = res['rows']  # doesn't need format_results
    return results, total_results
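
# A minimal paging sketch for hosted_results above, as suggested by its
# docstring. The client instance name (`ir`) is an illustrative assumption,
# not part of the original snippet: it keeps requesting pages until
# total_results entries have been gathered.
def all_hosted_results(ir, **kwargs):
    page = 1
    gathered = []
    while True:
        results, total = ir.hosted_results(page=page, **kwargs)
        gathered.extend(results)
        if not results or len(gathered) >= total:
            return gathered
        page += 1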
def decrypt(self, request, sessionid):
    """
    Avoid showing plain sessionids
    Optionally require that a referer exists and matches the whitelist,
    or reset the session
    """
    if not sessionid:
        return ""
    # (nonce, sessionid) = sessionid.split(":", 1)
    # sessionid = self.xor(nonce, sessionid.decode("base64"))
    secret = self._secret(request)
    if self.settings.get("HOSTS", []):
        referer = request.META.get("HTTP_REFERER", "None")
        if referer == "None":
            # End session unless a referer is passed
            return ""
        url = parse(referer)
        if url.hostname not in self.settings["HOSTS"]:
            err = "%s is unauthorised" % url.hostname
            raise Exception(err)
    cipher = Fernet(secret)
    session_key = cipher.decrypt(sessionid)
    try:
        return str(session_key, "utf8")
    except:
        return ""
def convert(data, field):
    if isinstance(data, Literal):
        data = data.value
    if isinstance(data, URIRef):
        return str(data)
    if isinstance(field, IndexedLanguageField):
        lng = {}
        for d in data:
            lang = d.language
            if not lang:
                lang = 'null'
            lng[lang] = str(d)
        return lng
    if isinstance(data, list):
        return [x for x in [convert(x, field) for x in data] if x]
    elif isinstance(field, IndexedDateTimeField):
        if data is None:
            return None
        if isinstance(data, str):
            data = parse(data)
        return data.strftime('%Y-%m-%dT%H:%M:%S')
    elif data and isinstance(data, FedoraObject):
        return data.id
    return data
def list_pages(namespace_url=None):
    list_url = namespace_url or INDEX_INDEX
    print('Crawling {}'.format(list_url))
    tree = parse(list_url)
    for a in tree.xpath('//a[@class="twikilink"]'):
        name = a.text.strip()
        url = a.attrib['href']
        if namespace_url:
            yield (name,), url
        else:
            yield ('Main', name), url
    if not namespace_url:
        namespaces = tree.xpath(
            '//a[starts-with(@href, "index_report.php?groupname=")]'
        )
        for a in namespaces:
            namespace = a.text.strip()
            url = urllib.parse.urljoin(INDEX_INDEX, a.attrib['href'])
            for key, value in list_pages(url):
                assert len(key) == 1
                yield (namespace,) + key, value
def parseParms(xfile):
    if debugMode():
        print("parseParms:", xfile)
    pdict = {}
    try:
        statxml = os.stat(xfile)
    except:
        print("Error, file", xfile, "not found")
        return None
    try:
        t = parse(xfile)
    except:
        print("Error, could not parse", xfile)
        return None
    root = t.getroot()
    kids = list(root)
    for k in kids:
        pdict[k.tag] = k.text
    return pdict
def iratingchart(self, custid=None, category=ct.IRATING_ROAD_CHART):
    """ Gets the irating data of a driver using its custom id (custid)
        that generates the chart located in the driver's profile."""
    r = self.__req(ct.URL_STATS_CHART % (custid, category),
                   cookie=self.last_cookie)
    return parse(r)
def get_article(self, candidates, best_candidate):
    # Now that we have the top candidate, look through its siblings for
    # content that might also be related. Things like preambles, content
    # split by ads that we removed, etc.
    sibling_score_threshold = max([10, best_candidate['content_score'] * 0.2])
    output = parse("<div/>")
    for sibling in best_candidate['elem'].parent.contents:
        if isinstance(sibling, NavigableString):
            continue
        append = False
        if sibling is best_candidate['elem']:
            append = True
        sibling_key = HashableElement(sibling)
        if sibling_key in candidates and \
                candidates[sibling_key]['content_score'] >= sibling_score_threshold:
            append = True
        if sibling.name == "p":
            link_density = self.get_link_density(sibling)
            node_content = sibling.string or ""
            node_length = len(node_content)
            if node_length > 80 and link_density < 0.25:
                append = True
            elif node_length < 80 and link_density == 0 and re.search(r'\.( |$)', node_content):
                append = True
        if append:
            output.append(sibling)
    if not output:
        output.append(best_candidate)
    return output
def __check_cookie(self):
    """ Checks the cookie by testing a request response"""
    r = parse(self.__req(ct.URL_DRIVER_COUNTS, cookie=self.last_cookie))
    if isinstance(r, dict):
        return True
    return False
def load_url(parse, url, max_requests=1, timeout=60):
    requests = []
    for i in range(max_requests):
        req = {'url': url, 'date': arrow.now(TZ).format(TIMEFMT)}
        requests.append(req)
        try:
            with urllib.request.urlopen(url, timeout=timeout) as conn:
                req['code'] = conn.getcode()
                now = arrow.now(TZ)
                data = parse(conn)
                return LoadUrlResult(data, now, requests)
        except urllib.error.HTTPError as e:
            req['code'] = e.code
    now = arrow.now(TZ)
    raise LoadUrlException(now, requests)
def main():
    ''' download file and return it as string '''
    cj = http.cookiejar.CookieJar()
    opener = urllib.request.build_opener(urllib.request.HTTPCookieProcessor(cj))
    urllib.request.install_opener(opener)
    inputhtml = urllib.request.urlopen(URL1).readlines()
    print(cj)
    imgdata = parse(inputhtml)
    writedata('img.png', imgdata)
    ocrfix()
    password = ocrdecode()
    print(password)
    postdata = post_data(password)
    print(postdata)
    responsehtml = urllib.request.urlopen(URL1, postdata).readlines()
    resultlines = list(map(lambda x: x.decode("utf-8"), responsehtml))
    for r in resultlines:
        print(r)
def onSend(self):
    self.getFields()
    message = self.generateMessage().toLatin1()
    subject = self.subject_.toLatin1()
    params = urllib.parse.urlencode({
        "kontakt": "cad",
        "from_name": self.name_,
        "from_mail": self.email_,
        "subject": subject,
        "kommentar": message,
    })
    headers = {"Content-type": "application/x-www-form-urlencoded",
               "Accept": "text/plain"}
    conn = http.client.HTTPConnection("www.ipek.uni-karlsruhe.de:80")
    conn.request("POST", "/cms/de/kontakt/kontakt.php", params, headers)
    response = conn.getresponse()
    print(response.status, response.reason)
    data = response.read()
    conn.close()
    self.close()
    return
def inputMove():
    moves = []
    mc.events.clearAll()
    while len(moves) < 2:
        try:
            chats = mc.events.pollChatPosts()
            move = parse(chats[0].message)
            for m in moves:
                drawSquare(m[0], m[1])
            return move
        except:
            pass
        hits = mc.events.pollBlockHits()
        if len(hits) > 0:
            c = hits[0].pos
            if (corner.x <= c.x and corner.y - 1 <= c.y and corner.z <= c.z and
                    c.x < corner.x + 64 and c.y < corner.y + MAXHEIGHT and
                    c.z < corner.z + 64):
                m = (c.x - corner.x) / 8, (c.z - corner.z) / 8
                if len(moves) == 0 or m[0] != moves[0][0] or m[1] != moves[0][1]:
                    highlightSquare(m[0], m[1])
                    moves.append(m)
                    time.sleep(0.2)
                    mc.events.clearAll()  # debounce
                    continue
            for m in moves:
                drawSquare(m[0], m[1])
            moves = []
            mc.postToChat('Canceled. Enter another move.')
            time.sleep(0.2)
            mc.events.clearAll()  # debounce
        time.sleep(0.2)
    for m in moves:
        drawSquare(m[0], m[1])
    return tuple(moves)
def append_next_page(parsed_urls, page_index, page_url, doc, options):
    logging.debug("appending next page: %s" % page_url)
    fetcher = options["urlfetch"]
    html = fetcher.urlread(page_url)
    orig_page_doc = parse(html, page_url)
    next_page_url = find_next_page_url(parsed_urls, page_url, orig_page_doc)
    page_article = get_article(orig_page_doc, options)
    log.debug("Appending " + str(page_article))
    if page_article.html:
        page_doc = fragment_fromstring(page_article.html)
        make_page_elem(page_index, page_doc)
        if not is_suspected_duplicate(doc, page_doc):
            # page_doc is a singular element containing the page article elements.
            # We want to add its children to the main article document to which we
            # are appending a page.
            if doc.tag == "html":
                children = doc.getchildren()
                if children[0].tag == "head":
                    for elem in page_doc:
                        doc.getchildren()[1].append(elem)
                else:
                    for elem in page_doc:
                        doc.getchildren()[0].append(elem)
            else:
                for elem in page_doc:
                    doc.append(elem)
            doc.append(page_doc)
    if next_page_url is not None:
        append_next_page(parsed_urls, page_index + 1, next_page_url, doc, options)
def __init__(self, content, name=None, namespace=None):
    self.content = content
    self._unicode = isinstance(content, str)
    self.name = name
    self._parsed = parse(content, name=name)
    if namespace is None:
        namespace = {}
    self.namespace = namespace
def driverdata(self, drivername):
    """ Personal data of driver using its name in the request
        (i.e drivername="Victor Beltran"). """
    r = self.__req(ct.URL_DRIVER_STATUS % (encode({'searchTerms': drivername})),
                   cookie=self.last_cookie)
    # tofile(r)
    return parse(r)
def _convertXML(self):
    """
    Convert an XML result into a Python dom tree. This method can be
    overwritten in a subclass for a different conversion method.
    @return: converted result
    @rtype: PyXlib DOM node
    """
    from xml.dom.minidom import parse
    return parse(self.response)
def driver_search(self, race_type=ct.RACE_TYPE_ROAD, location=ct.LOC_ALL,
                  license=(ct.LIC_ROOKIE, ct.ALL), irating=(0, ct.ALL),
                  ttrating=(0, ct.ALL), avg_start=(0, ct.ALL),
                  avg_finish=(0, ct.ALL), avg_points=(0, ct.ALL),
                  avg_incs=(0, ct.ALL), active=False, sort=ct.SORT_IRATING,
                  page=1, order=ct.ORDER_DESC):
    """Search drivers using several search fields. A tuple represents a range
       (i.e irating=(1000, 2000) gets drivers with irating between 1000 and
       2000). Using ct.ALL in the lower or upper bound of a range disables
       that limit. Returns a tuple (results, total_results) so if you want
       all results you should request different pages (using page) until you
       gather all total_results. Each page has 25 (ct.NUM_ENTRIES) results
       max."""
    lowerbound = ct.NUM_ENTRIES * (page - 1) + 1
    upperbound = lowerbound + ct.NUM_ENTRIES - 1
    search = 'null'
    friend = ct.ALL  # TODO
    studied = ct.ALL  # TODO
    recent = ct.ALL  # TODO
    active = int(active)
    # Data to POST
    data = {'custid': self.custid, 'search': search, 'friend': friend,
            'watched': studied, 'country': location, 'recent': recent,
            'category': race_type, 'classlow': license[0],
            'classhigh': license[1], 'iratinglow': irating[0],
            'iratinghigh': irating[1], 'ttratinglow': ttrating[0],
            'ttratinghigh': ttrating[1], 'avgstartlow': avg_start[0],
            'avgstarthigh': avg_start[1], 'avgfinishlow': avg_finish[0],
            'avgfinishhigh': avg_finish[1], 'avgpointslow': avg_points[0],
            'avgpointshigh': avg_points[1], 'avgincidentslow': avg_incs[0],
            'avgincidentshigh': avg_incs[1], 'lowerbound': lowerbound,
            'upperbound': upperbound, 'sort': sort, 'order': order,
            'active': active}
    total_results, drivers = 0, {}
    try:
        r = self.__req(ct.URL_DRIVER_STATS, data=data,
                       cookie=self.last_cookie)
        res = parse(r)
        total_results = res['d']['32']
        header = res['m']
        f = res['d']['r'][0]
        if int(f['29']) == int(self.custid):  # 29 is custid
            drivers = res['d']['r'][1:]
        else:
            drivers = res['d']['r']
        drivers = format_results(drivers, header)
    except Exception as e:
        pprint(("Error fetching driver search data. Error:", e), self.verbose)
    return drivers, total_results
def findDate(inputValue):
    try:
        DateTimeString = str(parse(inputValue, ignoretz=True))
    except:
        # if not found, epoch = 0
        DateTimeString = "1970-01-01 00:00:00"
    # Convert to epoch date time and return the value
    return toEpoch(DateTimeString)
def _summary(self, enclose_with_html_tag=True):
    # the first page parsed into an elementree element
    doc = self.html
    # the set of urls we've processed so far
    parsed_urls = set()
    url = self.options.get("url", None)
    if url is not None:
        parsed_urls.add(url)
    # check the current doc for a next page if requested
    if self.options.get("multipage", False):
        next_page_url = find_next_page_url(parsed_urls, url, doc)
        page_0 = get_article(doc, self.options)
        page_0_doc = fragment_fromstring(page_0.html)
        page_index = 0
        make_page_elem(page_index, page_0_doc)
        if enclose_with_html_tag:
            output = document_fromstring("<div/>")
            output.getchildren()[0].attrib["id"] = "article"
            output.getchildren()[0].append(page_0_doc)
        else:
            output = fragment_fromstring("<div/>")
            output.attrib["id"] = "article"
            output.append(page_0_doc)
        if next_page_url is not None:
            append_next_page(parsed_urls, page_index + 1, next_page_url,
                             output, self.options)
        return Summary(
            tostring(output),
            page_0.confidence,
            short_title=shorten_title(output),
            title=get_title(output),
            description=get_description(output),
            keywords=get_keywords(output),
        )
    summary = get_article(doc, self.options,
                          enclose_with_html_tag=enclose_with_html_tag)
    print(len(summary.html), "============================")
    if summary.title == "[something-wrong]" or len(summary.html) < 500:
        output = parse(self.input_doc, self.options.get("url"))
        remove_unlikely_candidates(output)
        o = open("something-wrong.txt", "w")
        print("[something-wrong]", tostring(output), file=o)
        return Summary(
            get_clean_html(output),
            0,
            short_title=shorten_title(output),
            title=get_title(output),
            description=get_description(output),
            keywords=get_keywords(output),
        )
    else:
        return summary
def get_result_dicts(cls, data, parser, mm_key=None, onlyif=None):
    if not hasattr(parser, "items"):
        parser = {"key": parser}
    if "key" not in parser:
        yield data
        return
    key = parser["key"]
    rex = None
    if "regex" in parser:
        rex = re.compile(parser["regex"], flags=re.I)
    if key == "@" and mm_key is not None:
        yield {key: mm_key}
        return
    values = cls.get_value(data, key)
    if values is None:
        return
    if not parser.get("match_all", False):
        values = [values]
    for val in values:
        result_dict = OrderedDict()
        if rex:
            m = rex.search(val)
            if not m:
                return
            if len(m.groups()) > 0:
                val = m.groups()
                if len(val) == 1:
                    val = val[0]
        urldecode = str(parser.get("urldecode", False)).lower()
        if urldecode in ("1", "yes", "true"):
            val = urllib.parse.unquote(val)
        elif urldecode == "twice":
            val = urllib.parse.unquote(urllib.parse.unquote(val))
        if "format" in parser:
            if parser["format"] == "as_list":
                val = ", ".join(map(str, val))
            elif parser["format"] == "as_time":
                try:
                    dt = datetime.datetime.fromtimestamp(val)
                except:
                    dt = parse(val)
                val = dt.isoformat()
        result_dict[key] = val
        yield result_dict
def series_raceresults(self, season, raceweek):
    """ Gets races results of all races of season in specified raceweek """
    r = self.__req(ct.URL_SERIES_RACERESULTS,
                   data={'seasonid': season, 'raceweek': raceweek})  # TODO no bounds?
    res = parse(r)
    header = res['m']
    results = res['d']
    results = format_results(results, header)
    return results
def parse_environ(name, **default_vals):
    """
    same as parse() but you pass in an environment variable name that will be
    used to fetch the dsn

    name -- string -- the environment variable name that contains the dsn to parse
    **default_vals -- dict -- any values you want to have defaults for if they
        aren't in the dsn

    return -- ParseResult() tuple
    """
    return parse(os.environ[name], **default_vals)
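
# A minimal usage sketch for parse_environ above. The environment variable
# name ("DATABASE_DSN"), the dsn string, and the `port` default are
# illustrative assumptions, not part of the original snippet.
import os

os.environ["DATABASE_DSN"] = "postgres://user:pass@localhost:5432/mydb"
result = parse_environ("DATABASE_DSN", port=5432)  # default used if dsn omits it
print(result)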
def request(self, method: str, path: str = "", **kwargs):
    response = super().request(
        method, urllib.parse.urljoin(self.base_url.geturl(), path), **kwargs)
    response.parsed = lambda: parse(response)
    return response
def rc_by_date(start, end):
    global df
    data_load()
    df = df[(df['Resolved'] > parse(start)) & (df['Resolved'] < parse(end))]
    # df = df_dt
    return df.to_html()
def load_schedule(filename):
    with open(filename, encoding='utf-8') as f:
        data = f.read()
    return parse(data)
def personal_best(self, custid=None, carid=0):
    """ Personal best times of driver (custid) using car
        (carid. check self.CARS) set in official events."""
    r = self.__req(ct.URL_PERSONAL_BEST % (carid, custid),
                   cookie=self.last_cookie)
    return parse(r)
def yearly_stats(self, custid=None):
    """ Gets yearly stats (top5, top 10, etc.) of driver (custid)."""
    r = self.__req(ct.URL_YEARLY_STATS % (custid),
                   cookie=self.last_cookie)
    # tofile(r)
    return parse(r)
def solve():
    export_dir = pathlib.Path('export')
    conditions = parse(export_dir)
    print(''.join([conv(condition) for condition in conditions]))
# iterate through the tuple of RSS feeds and look for pubDate
datereq = utc.localize(datereq)  # localize the date requirement so we can compare it with the published date
i = 0  # int to name each item in the list
feedList = []  # list to store all feed objects created
for x in rssfeeds:
    i = i + 1
    name = "r" + str(i)  # make string name
    NewsFeed = feedparser.parse(x)  # parse newsfeed
    entry = NewsFeed.entries[1]  # get first entry from RSS feed
    dt = parse(entry.published)  # parse the date from the RSS feed pubDate format to datetime
    feedList.append(Feed(name, entry.title, dt, x))  # create object and add to feedList
if int(mindays) < 2:
    print("\nThese feeds had no activity for " + mindays + " day :")
else:
    print("\nThese feeds had no activity for " + mindays + " days :")
for feed in feedList:
    if feed.pubdate < datereq:  # if published before the required date, print its info
        print("")
        print("Most Recent Article: " + feed.title)
        print("Date Published: " + str(feed.pubdate))
        print("RSS Feed URL: " + feed.url + "")
        print("")
def main():
    url = 'https://sadovod.city/category/66'
    total_pages = get_total_pages(get_html(url, 1))
    for i in range(1, total_pages):
        print(i)
        parse(get_html(url, i))
        fl.close()
        total_time = time.time() - start
        mb_sec = (os.path.getsize(o_file) / (1024 * 1024.0)) / total_time
        print(f"{get_timenow()}: Speed: {mb_sec} MB/s")
        print(f"{get_timenow()}: Total Time: {total_time} s")
        temp_size = os.path.getsize(o_file)
        download_complete = temp_size == filesize_b
        if not download_complete:
            percent = temp_size / filesize_b * 100
            print(f"{get_timenow()}: Wrote {temp_size} MB ({percent:.2f} %)")
            print(
                f"Download not completed somehow. "
                f"\n Restarting download from where we left off: ({temp_size/(1024 * 1024):.2f} MB)"
            )
        else:
            print(f"Download completed. Breaking loop. Wrote {temp_size} MB")
            break
        cookie = get_request_cookies(inps)


if __name__ == '__main__':
    if len(sys.argv) == 1:
        sys.argv.append('-h')
    ### READ IN PARAMETERS FROM THE COMMAND LINE ###
    inps = parse()
    download(inps)
def parseStrDate(dateString):
    try:
        dateTimeObj = parse(dateString)
        return dateTimeObj
    except:
        return None
def on_data(self, data):
    global f
    global filecnt
    global tweetcnt
    global chkFlag
    if tweetcnt >= numTweets and numTweets != 0:
        print("first")
        chkFlag = False
        return False
    if filecnt >= 500:
        print("filecnt")
        chkFlag = False
        return False
    if f.tell() >= 10485760:
        print("last")
        f.close()
        chkFlag = True
        filecnt += 1
        outputPath = dirName
        outputPath += '/'
        outputPath += 'twitter_data'
        outputPath += str(filecnt)
        outputPath += '.txt'
        f = open(outputPath, 'a')
    decoded = json.loads(data)
    # encode/decode round trip drops non-ASCII characters
    username = str(decoded['user']['screen_name']).encode("ascii", "ignore").decode("ascii")
    userTweet = str(decoded['text']).encode("ascii", "ignore").decode("ascii")
    userTweet = userTweet.replace('\n', ' ').replace('\t', '').replace('\r', '')
    userTweetTime = str(decoded['created_at'])  # gets timestamp
    userLocation = str(decoded['user']['location']).encode("ascii", "ignore").decode("ascii")
    userCoords = str(decoded['coordinates']).encode("ascii", "ignore").decode("ascii")
    userURLS = str(decoded['entities']['urls']).encode("ascii", "ignore").decode("ascii")
    userData = "Date:" + userTweetTime + " Coords:" + userCoords[36:-1] + \
        " User:" + username + " Text:" + userTweet
    userData += " Hashtags:"
    userHashtags = decoded['entities']['hashtags']
    if userHashtags != "[]":
        tmp = decoded['text']
        for Hashtags in userHashtags:
            userHashtags = str(Hashtags['text']).encode("ascii", "ignore").decode("ascii")
            userData += userHashtags + " "
    # url
    pageTitle = None
    userData += " URL:"
    if userURLS != "[]":
        expanded_url = str(decoded['entities']['urls'][0]['expanded_url']).encode(
            "ascii", "ignore").decode("ascii")
        userData += expanded_url
        try:
            page = urllib.request.urlopen(expanded_url)
            p = parse(page)
            pageT = p.find(".//title")
            if pageT is not None:
                pageTitle = str(p.find(".//title").text).encode(
                    "ascii", "ignore").decode("ascii")
        except urllib.error.HTTPError as err:
            if err.code == 404:
                print("Page not found!")
            elif err.code == 403:
                print("Access denied!")
            else:
                print("Error:", err.code)
        except urllib.error.URLError as err:
            print("URL error:", err.reason)
        except BadStatusLine:
            print("Could not fetch URL")
    userData += " Title:"
    if pageTitle is not None:
        pageTitle = re.sub('[^A-Za-z0-9]+', ' ', pageTitle)
        userData += pageTitle
    tweetcnt += 1
    print('Tweet:', tweetcnt, ' F.size = ', f.tell(), ' on file:', filecnt)
    userData += "\n"
    print(userData)
    f.write(userData)
    return True
dest="minimize_boolean_attributes", help="minimize boolean attributes") parser.add_option("", "--use-trailing-solidus", action="store_true", default=False, dest="use_trailing_solidus", help="use trailing solidus") parser.add_option("", "--space-before-trailing-solidus", action="store_true", default=False, dest="space_before_trailing_solidus", help="add space before trailing solidus") parser.add_option("", "--escape-lt-in-attrs", action="store_true", default=False, dest="escape_lt_in_attrs", help="escape less than signs in attribute values") parser.add_option("", "--escape-rcdata", action="store_true", default=False, dest="escape_rcdata", help="escape rcdata element values") parser.add_option("", "--sanitize", action="store_true", default=False, dest="sanitize", help="sanitize") parser.add_option("-l", "--log", action="store_true", default=False, dest="log", help="log state transitions") return parser if __name__ == "__main__": parse()
    data = {
        'working_hours': working_hours,
        'info': info,
        'ratings_histogram': ratings_histogram,
        'name': name,
        'phone': phone,
        'ratings': ratings,
        'address': address,
        'health_rating': health_rating,
        'price_range': price_range,
        'claimed_status': claimed_status,
        'reviews': reviews,
        'category': category,
        'website': website,
        'latitude': latitude,
        'longitude': longitude,
        'url': url
    }
    return data


if __name__ == "__main__":
    argparser = argparse.ArgumentParser()
    argparser.add_argument('url', help='yelp business url')
    args = argparser.parse_args()
    url = args.url
    scraped_data = parse(url)
    yelp_id = url.split('/')[-1]
    with open("scraped_data_%s.json" % yelp_id, 'w') as fp:
        json.dump(scraped_data, fp, indent=4)
def career_stats(self, custid=None):
    """ Gets career stats (top5, top 10, etc.) of driver (custid)."""
    r = self.__req(ct.URL_CAREER_STATS % (custid),
                   cookie=self.last_cookie)
    return parse(r)[0]
# import xml.etree.ElementTree as ET
from urllib.request import urlopen
from xml.etree.ElementTree import parse
import ssl

# Ignore SSL certificate errors
ctx = ssl.create_default_context()
ctx.check_hostname = False
ctx.verify_mode = ssl.CERT_NONE

# Actual Data
# Prompt for URL of the xml file and parse the XML
link = input('Enter - ')
url = urlopen(link, context=ctx)
# url = urlopen('http://py4e-data.dr-chuck.net/comments_42.xml')
xmldoc = parse(url)
print("Retrieving ", link)

# Determine the number of count entries in the xml tree
counts = xmldoc.findall('.//count')
# counts = xml_tree.findall('./comments/comment/count')
print("Count: ", (len(counts)))

# This will be a running tally of numbers extracted from the xml
count_list = []
for item in xmldoc.iterfind('comments/comment'):
    count_number = int(item.findtext('count'))
    count_list.append(count_number)
def cars_driven(self, custid=None):
    """ Gets list of cars driven by driver (custid)."""
    r = self.__req(ct.URL_CARS_DRIVEN % (custid),
                   cookie=self.last_cookie)
    # tofile(r)
    return parse(r)
def parse_product(self, response):
    tmp = []
    for p in response.xpath("//table//tr//td[2]"):
        tmp.append(p)
    title = tmp[0].xpath("./p/text()").extract()[0]
    url = urllib.parse.urljoin(self.download_path,
                               tmp[3].xpath("./a/@href").extract()[0])

    def parse(title):
        print(title)
        product = version = date = None
        tmp = title.split(' ')
        product = tmp[0]
        if len(tmp) == 2:
            # e.g. "MR814v1_070807 升级程序" (firmware upgrade package)
            if '_' in tmp[0]:
                tmp2 = tmp[0].split('_')
                version = tmp2[0]
                date = tmp2[1][:6]
            # e.g. "MWR300T V1(081210)标准版" (standard edition)
            elif tmp[1][0] in ['v', 'V']:
                pass
            else:
                tmp2 = tmp[1].split('_')
                version = tmp2[0]
                date = tmp2[1][:6]
        elif len(tmp) == 3:
            tmp2 = tmp[1].split('_')
            version = tmp2[0]
            date = tmp2[1]
        if version:
            if version[0] not in ['v', 'V']:
                if 'v' in product:
                    t = product.split('v')
                    product = t[0]
                    version = t[1]
        # e.g. "MR814v1_070807 升级程序" (firmware upgrade package)
        if product.count('_'):
            tmp = product.split('_')
            product = tmp[0]
        if product.count('v'):
            product = product.split('v')[0]
        elif product.count('V'):
            product = product.split('V')[0]
        return product, version, date

    product, version, date = parse(title)
    item = FirmwareLoader(item=FirmwareImage())
    item.add_value("url", url)
    item.add_value("product", product)
    # item.add_value("date", date)
    # item.add_value("version", version)
    item.add_value("vendor", self.vendor)
    item.add_value("description", title)
    yield item.load_item()
def lastrace_stats(self, custid=None):
    """ Gets stats of last races (10 max?) of driver (custid)."""
    r = self.__req(ct.URL_LASTRACE_STATS % (custid),
                   cookie=self.last_cookie)
    return parse(r)
def is_date(string):
    try:
        parse(string, fuzzy=False)
        return True
    except ValueError:
        return False
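
# A small usage sketch for is_date above, assuming parse here is
# dateutil.parser.parse (an assumption; the import is not shown in the
# original snippet, but its ParserError subclasses ValueError).
from dateutil.parser import parse

print(is_date("2021-03-14"))   # True
print(is_date("not a date"))   # False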
def results_archive(self, custid=None, race_type=ct.RACE_TYPE_ROAD,
                    event_types=ct.ALL, official=ct.ALL,
                    license_level=ct.ALL, car=ct.ALL, track=ct.ALL,
                    series=ct.ALL, season=(2014, 1, ct.ALL),
                    date_range=ct.ALL, page=1, sort=ct.SORT_TIME,
                    order=ct.ORDER_DESC):
    """ Search race results using various fields. Returns a tuple
        (results, total_results) so if you want all results you should
        request different pages (using page). Each page has 25
        (ct.NUM_ENTRIES) results max."""
    format_ = 'json'
    lowerbound = ct.NUM_ENTRIES * (page - 1) + 1
    upperbound = lowerbound + ct.NUM_ENTRIES - 1
    # TODO carclassid, seriesid in constants
    data = {
        'format': format_,
        'custid': custid,
        'seriesid': series,
        'carid': car,
        'trackid': track,
        'lowerbound': lowerbound,
        'upperbound': upperbound,
        'sort': sort,
        'order': order,
        'category': race_type,
        'showtts': 0,
        'showraces': 0,
        'showquals': 0,
        'showops': 0,
        'showofficial': 0,
        'showunofficial': 0,
        'showrookie': 0,
        'showclassa': 0,
        'showclassb': 0,
        'showclassc': 0,
        'showclassd': 0,
        'showpro': 0,
        'showprowc': 0,
    }
    # Events
    ev_vars = {ct.EVENT_RACE: 'showraces', ct.EVENT_QUALY: 'showquals',
               ct.EVENT_PRACTICE: 'showops', ct.EVENT_TTRIAL: 'showtts'}
    if event_types == ct.ALL:
        event_types = (ct.EVENT_RACE, ct.EVENT_QUALY, ct.EVENT_PRACTICE,
                       ct.EVENT_TTRIAL)
    for v in event_types:
        data[ev_vars[v]] = 1
    # Official, unofficial
    if official == ct.ALL:
        data['showofficial'] = 1
        data['showunofficial'] = 1
    else:
        if ct.EVENT_UNOFFICIAL in official:
            data['showunofficial'] = 1
        if ct.EVENT_OFFICIAL in official:
            data['showofficial'] = 1
    # Season
    if date_range == ct.ALL:
        data['seasonyear'] = season[0]
        data['seasonquarter'] = season[1]
        if season[2] != ct.ALL:
            data['raceweek'] = season[2]
    else:
        # Date range
        tc = lambda s: \
            time.mktime(datetime.datetime.strptime(s, "%Y-%m-%d").timetuple()) * 1000
        data['starttime_low'] = tc(date_range[0])  # multiplied by 1000
        data['starttime_high'] = tc(date_range[1])
    # License levels
    lic_vars = {ct.LIC_ROOKIE: 'showrookie', ct.LIC_A: 'showclassa',
                ct.LIC_B: 'showclassb', ct.LIC_C: 'showclassc',
                ct.LIC_D: 'showclassd', ct.LIC_PRO: 'showpro',
                ct.LIC_PRO_WC: 'showprowc'}
    if license_level == ct.ALL:
        license_level = (ct.LIC_ROOKIE, ct.LIC_A, ct.LIC_B, ct.LIC_C,
                         ct.LIC_D, ct.LIC_PRO, ct.LIC_PRO_WC)
    for v in license_level:
        data[lic_vars[v]] = 1
    r = self.__req(ct.URL_RESULTS_ARCHIVE, data=data,
                   cookie=self.last_cookie)
    res = parse(r)
    total_results, results = 0, []
    if len(res['d']):
        total_results = res['d']['46']
        results = res['d']['r']
        header = res['m']
        results = format_results(results, header)
    return results, total_results
def parse_iaga(lines, iagacode=None):
    '''
    KyotoWDC uses two format types: WDC, which is data specific, and
    IAGA-2002, which is general for all data types. This function is a
    general reader for this format. It returns a dictionary of vectors,
    each corresponding to a column from the file.

    'lines' is simply a list of lines from the IAGA-formatted file.

    'iagacode', if given, should be a string containing the IAGA code for
    the file contents. If given, this function will raise an exception if
    iagacode does not match the file's code. This is useful for ensuring
    the correct data values are located in this file.
    '''
    from dateutil.parser import parse

    # Begin by parsing header; ensuring the correct file format.
    fmt = (lines.pop(0)).split()
    if (fmt[0] != 'Format') or (fmt[1] != 'IAGA-2002'):
        raise Exception('Data is not in IAGA-2002 format.')

    # Parse mandatory IAGA header lines.
    source = (lines.pop(0)).split()[1]
    lines.pop(0)
    code = (lines.pop(0)).split()[2]
    for i in range(8):
        lines.pop(0)

    # Check IAGA code as necessary.
    if iagacode:
        if iagacode != code:
            raise Exception("IAGA Code does not match required code.")

    # Loop through and count optional header lines.
    nHead = 12
    while True:
        line = lines.pop(0)
        if line[:2] != ' #':
            break
        nHead += 1

    # Parse column labels. We don't need time or DOY.
    parts = line.lower().split()[3:-1]
    data = {'time': [], 'doy': []}
    for name in parts:
        data[name] = []

    # Read all data.
    for l in lines:
        if l[-2] == '|':
            continue  # skip repeat headers.
        p = l.split()
        data['time'].append(parse(' '.join(p[0:2])))
        data['doy'].append(int(p[2]))
        for i, name in enumerate(parts):
            data[name].append(float(p[i + 3]))

    # Convert to dmarrays.
    for name in data:
        data[name] = dmarray(data[name])

    return data
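
# A short usage sketch for parse_iaga above. The file name ('kak_dst.dat')
# and the IAGA code ('KAK') are illustrative assumptions; any IAGA-2002
# file and its matching station code would do.
with open('kak_dst.dat') as f:
    lines = f.readlines()

data = parse_iaga(lines, iagacode='KAK')
print(sorted(data.keys()))   # column names plus 'time' and 'doy'
print(data['time'][0])       # first timestamp as a datetime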
f = open(f, "rb") parser.ParseFile(f) f.close() else: parser.ParseFile(f) except Exception as e: raise OwlReadyOntologyParsingError("OWL/XML parsing error in file %s, line %s, column %s." % (getattr(f, "name", "???"), parser.CurrentLineNumber, parser.CurrentColumnNumber)) from e return nb_triple def _rindex(l): i = len(l) - 1 while l[i] != "(": i -= 1 return i if __name__ == "__main__": filename = sys.argv[-1] import time t = time.time() nb_triple = parse(filename) t = time.time() - t print("# %s triples read in %ss" % (nb_triple, t), file = sys.stderr)
def main():
    if not os.path.exists(datadir):
        os.makedirs(datadir)
    parse()
def search(self, search_string, season=None, episode=None):
    if season and episode:
        searches = self.se_ep(search_string, season, episode)
    else:
        searches = [search_string]

    # get token for api
    url = '{}?get_token=get_token&app_id=tvoverlord'.format(self.baseurl)
    try:
        r = requests.get(url)
    except requests.exceptions.ConnectionError:
        return []
    if r.status_code == 403:
        self.url = url
        return []
    j = r.json()
    token = j['token']

    torrents = []
    count = 0
    loop_number = 0
    for search in searches:
        # the torrentapi only allows one query every two seconds
        if count > 0:
            time.sleep(2)
        count += 1

        search_tpl = '{}?mode=search&search_string={}&token={}&format=json_extended&sort=seeders&limit=100&app_id=tvoverlord'
        search_string = urllib.parse.quote(search)
        url = search_tpl.format(self.baseurl, search_string, token)

        try:
            loop_number += 1
            self.logger.info('%s[%s]@%s via "%s"'
                             % (self.job_id, self.shortname, loop_number, url))
            r = requests.get(url)
        except requests.exceptions.ConnectionError:
            # can't connect, go to next url
            continue

        results = r.json()
        if 'error_code' in results.keys() and results['error_code'] == 20:
            continue  # no results found
        try:
            shows = results['torrent_results']
        except KeyError:
            # no results
            continue

        for show in shows:
            torrent = Torrent()
            torrent.title = show['title']
            torrent.date = parse(show['pubdate'].split(' ')[0])
            torrent.size = int(show['size'])
            torrent.seeders = int(show['seeders'])
            torrent.magnet = show['download']
            torrent.tracker = self.shortname
            torrents.append(torrent)

        self.logger.info('%s[%s]@%s found %s result(s)'
                         % (self.job_id, self.shortname, loop_number, len(torrents)))
        if len(torrents) != 0:
            return torrents

    # We got this far with no results
    self.logger.info('%s[%s] exiting without any results'
                     % (self.job_id, self.shortname))
    return torrents
def _tokenize(s):
    """Removes conditional macros and splits string on macro boundaries"""
    def parse(inp):
        tree = []
        text = ''
        macro = ''
        buf = ''
        escape = False
        while inp:
            c = inp.pop(0)
            if c == '%':
                c = inp.pop(0)
                if c == '%':
                    text += c
                elif c == '{':
                    if text:
                        tree.append(('t', text))
                        text = ''
                    while inp and c not in ':}':
                        c = inp.pop(0)
                        buf += c
                    if c == ':':
                        tree.append(('c', buf[:-1], parse(inp)))
                        buf = ''
                    elif c == '}':
                        tree.append(('m', buf[:-1]))
                        buf = ''
                elif c == '(':
                    if text:
                        tree.append(('t', text))
                        text = ''
                    tree.append(('s', None, parse(inp)))
                else:
                    if text:
                        tree.append(('t', text))
                        text = ''
                    while inp and (c.isalnum() or c == '_'):
                        c = inp.pop(0)
                        macro += c
                    tree.append(('m', macro))
                    macro = ''
            elif c == '$':
                text += c
                c = inp.pop(0)
                if c == '{':
                    text += c
                    escape = True
            elif c == '}':
                if escape:
                    text += c
                    escape = False
                else:
                    if text:
                        tree.append(('t', text))
                    inp.append(c)
                    return tree
            elif c == ')':
                if text:
                    tree.append(('t', text))
                inp.append(c)
                return tree
            else:
                text += c
        if text:
            tree.append(('t', text))
        return tree

    def traverse(tree):
        result = []
        for node in tree:
            if node[0] == 't':
                # split text nodes on usual separators
                result.extend([t for t in re.split(r'(\.|-|_)', node[1]) if t])
            elif node[0] == 'm':
                m = '%{{{}}}'.format(node[1])
                if MacroHelper.expand(m):
                    result.append(m)
            elif node[0] == 'c':
                if MacroHelper.expand('%{{{}:1}}'.format(node[1])):
                    result.extend(traverse(node[2]))
            elif node[0] == 's':
                # ignore shell expansions, push nonsensical value
                result.append('@')
        return result

    inp = list(s)
    tree = parse(inp)
    return traverse(tree)
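
# An illustrative call for _tokenize above (assumptions: MacroHelper.expand
# is importable and the %{name} macro is defined, so the macro token is kept
# rather than dropped).
tokens = _tokenize('python-%{name}-2.7.5')
# With %{name} defined this yields something like:
# ['python', '-', '%{name}', '-', '2', '.', '7', '.', '5']
print(tokens)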
def driver_counts(self):
    """ Gets list of connected myracers and notifications. """
    r = self.__req(ct.URL_DRIVER_COUNTS, cookie=self.last_cookie)
    return parse(r)
        # print(final)
    except BaseException as e:
        print(e)


def parse(uid):
    # set the Referer and headers
    Referer = 'https://m.weibo.cn/u/{}'.format(uid)
    headers_1 = {
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; WOW64; rv:52.0) Gecko/20100101 Firefox/52.0",
        "Referer": Referer
    }
    # The real API address was captured with Firefox's network tool; its
    # parameters require the UID and the page number.
    for i in range(1, 11):
        url = ('https://m.weibo.cn/api/container/getIndex'
               '?type=uid&value={}&containerid=107603{}&page={}').format(uid, uid, i)
        # After fetching the page, convert the JSON to a dict and extract the info.
        webdata = session.get(url, headers=headers_1).text
        data = json.loads(webdata)
        news = data['data']['cards']
        for new in news:
            try:
                info = new['mblog']['text']
                # strip all HTML tags
                import re
                dr = re.compile(r'\<.*?\>', re.S)
                dd = dr.sub('', info)
                print(dd)
            except BaseException as e:
                print(e)


login(username='******', password='******')
parse('1173544654')
def authenticate(self, request):
    if not settings.HAUKI_SIGNED_AUTH_PSK:
        return None

    params = get_auth_params(request)
    if not len(params):
        return None

    if not all([params.get(k) for k in REQUIRED_AUTH_PARAM_NAMES]):
        return None

    data_string = join_params(params)
    calculated_signature = calculate_signature(data_string)

    if not compare_signatures(params["signature"], calculated_signature):
        raise exceptions.AuthenticationFailed(_("Invalid signature"))

    try:
        created_at = parse(params["created_at"])
        try:
            if created_at > timezone.now():
                raise exceptions.AuthenticationFailed(_("Invalid created_at"))
        except TypeError:
            raise exceptions.AuthenticationFailed(_("Invalid created_at"))
    except ValueError:
        raise exceptions.AuthenticationFailed(_("Invalid created_at"))

    try:
        valid_until = parse(params["valid_until"])
        try:
            if valid_until < timezone.now():
                raise exceptions.AuthenticationFailed(_("Invalid valid_until"))
        except TypeError:
            raise exceptions.AuthenticationFailed(_("Invalid valid_until"))
    except ValueError:
        raise exceptions.AuthenticationFailed(_("Invalid valid_until"))

    # TODO: Add separate PSKs for different integrations and only allow access
    #       to users initially from the same integration. Also only allow
    #       using organisations from the same integration.
    try:
        user = User.objects.get(username=params["username"])
    except User.DoesNotExist:
        user = User()
        user.set_unusable_password()
        user.username = params["username"]
        user.save()

    if not user.is_active:
        raise exceptions.AuthenticationFailed(_("User inactive or deleted."))

    if params.get("organization"):
        try:
            organization = Organization.objects.get(id=params["organization"])
            users_organizations = user.organization_memberships.all()
            if organization not in users_organizations:
                user.organization_memberships.add(organization)
        except Organization.DoesNotExist:
            # TODO: Should we raise an exception here?
            pass

    return user, None