def dimapi(url, api):
    """Fetch the raw text of a custom-dimension API endpoint.

    Args:
        url: the site being assessed (currently unused; kept for interface
            compatibility with surface.dimapi callers).
        api: URL of the custom API to query.

    Returns:
        The raw response text, or the string 'NA' when the fetch fails for
        a non-Webcred reason.

    Raises:
        WebcredError: when the API URL itself is invalid.
    """
    # REVIEW: `url` is never used here — confirm callers before removing it.
    try:
        uri = Urlattributes(api)
        return uri.gettext()
    except WebcredError:
        raise WebcredError("Give valid API")
    except Exception:
        # Narrowed from a bare `except:` so SystemExit/KeyboardInterrupt
        # are no longer swallowed; any other failure degrades to 'NA'.
        return 'NA'
def funcBrokenllinks(url):
    """Return True when *url* looks broken, i.e. building an
    Urlattributes for it raises WebcredError; False otherwise
    (including for a falsy url)."""
    if not url:
        return False
    try:
        Urlattributes(url)
    except WebcredError:
        return True
    return False
def getImgratio(url):
    """Return the ratio of text size to total (text + images) size.

    Spawns one MyThread per absolute image URL found in the page soup to
    fetch image sizes concurrently, then computes
    text_size / (text_size + total_img_size).

    Args:
        url: an Urlattributes instance for the page under assessment.

    Returns:
        A float in (0, 1]; 0 if both text and images have zero size.
    """
    total_img_size = 0
    threads = []
    text_size = url.getsize()
    soup = url.getsoup()

    # Launch one size-fetching worker per valid image URL.
    for link in soup.find_all('img', src=True):
        uri = link.get('src', None)
        # Resolve relative image paths against the page URL.
        if not uri.startswith(('http://', 'https://')):
            uri = url.geturl() + uri
        if validators.url(uri):
            try:
                uri = Urlattributes(uri)
                t = MyThread(funcImgratio, 'Imgratio', uri)
                t.start()
                threads.append(t)
            except Exception:
                ex_type, ex_value, ex_traceback = sys.exc_info()
                trace_back = traceback.extract_tb(ex_traceback)
                stack_trace = [
                    "File : %s , Line : %d, Func.Name : %s, Message : %s" %
                    (trace[0], trace[1], trace[2], trace[3])
                    for trace in trace_back
                ]
                # Python 3 exceptions have no `.message`; fall back to str().
                message = getattr(ex_value, 'message', str(ex_value))
                if message != 'Response 202':
                    logger.warning(ex_value)
                    logger.debug(stack_trace)

    # Collect the image sizes; non-int results (errors) are ignored.
    for t in threads:
        t.join()
        size = t.getResult()
        t.freemem()
        if isinstance(size, int):
            total_img_size += size

    total_size = total_img_size + text_size
    # Guard against a pathological completely-empty page.
    if not total_size:
        return 0
    return float(text_size) / total_size
def googleinlink(url):
    """Return the number of inbound links reported by Google Custom Search.

    Uses the `link:` query operator so only hyperlinks to the original URL
    are counted.

    Args:
        url: an Urlattributes instance for the page under assessment.

    Returns:
        The inlink count as an int, or None when the key is missing or the
        lookup fails.
    """
    API_KEY = os.environ.get('Google_API_KEY')
    inlinks = None
    if not API_KEY:
        # Without a key the URL concatenation below would raise TypeError.
        logger.info('Google_API_KEY is not set; skipping inlink lookup')
        return inlinks
    try:
        uri = ('https://www.googleapis.com/customsearch/v1?key=' + API_KEY +
               '&cx=017576662512468239146:omuauf_lfve&q=link:' +
               url.getoriginalurl())
        uri = Urlattributes(uri)
        txt = uri.gettext()
        for line in txt.splitlines():
            if "totalResults" in line:
                # Parse only the matching line; the old code broke out of
                # the loop first and would parse the *last* line of the
                # response when "totalResults" never appeared.
                inlinks = int(re.sub("[^0-9]", "", line))
                break
    except Exception:
        ex_type, ex_value, ex_traceback = sys.exc_info()
        trace_back = traceback.extract_tb(ex_traceback)
        stack_trace = [
            "File : %s , Line : %d, Func.Name : %s, Message : %s" %
            (trace[0], trace[1], trace[2], trace[3]) for trace in trace_back
        ]
        logger.info('Inlinks error {}'.format(ex_value))
        logger.debug(stack_trace)
    return inlinks
def getAlexarank(url):
    """Return the Alexa reach rank for *url*, or None when unavailable.

    Args:
        url: an Urlattributes instance or a plain URL string.
    """
    if not isinstance(url, Urlattributes):
        url = Urlattributes(url)
    uri = "http://data.alexa.com/data?cli=10&dat=s&url=" + url.geturl()
    uri = Urlattributes(uri)
    soup = uri.getsoup()
    try:
        rank = soup.find("reach")['rank']
    except (TypeError, KeyError):
        # Narrowed from a bare `except:` — find() returns None when the
        # <reach> tag is absent (TypeError on subscript) and the attribute
        # itself may be missing (KeyError).
        rank = None
    return rank
def getWot(url):
    """Return Web-of-Trust reputation data for *url*.

    Args:
        url: an Urlattributes instance or a plain URL string.

    Returns:
        dict with int 'reputation' and 'confidence' keys, or None when the
        response cannot be parsed (matching the other probes, which also
        report "no data" as None).
    """
    if not isinstance(url, Urlattributes):
        url = Urlattributes(url)
    # NOTE(review): the API key is hard-coded in source; it should be moved
    # to an environment variable like Google_API_KEY above.
    result = ("http://api.mywot.com/0.4/public_link_json2?hosts=" +
              url.geturl() +
              "/&callback=&key=d60fa334759ae377ceb9cd679dfa22aec57ed998")
    uri = Urlattributes(result)
    raw = uri.gettext()
    data = None
    try:
        # The JSONP wrapper is stripped positionally; a malformed or empty
        # payload previously raised out of this function.
        result = literal_eval(raw[1:-4])
        result = str(result).split(']')[0].split('[')[-1].split(',')
        if isinstance(result, list) and len(result) == 2:
            data = {
                'reputation': int(result[0]),
                'confidence': int(result[1]),
            }
    except (ValueError, SyntaxError):
        logger.debug('unparsable WOT response for {}'.format(url.geturl()))
    return data
def assess(self):
    """Assess the credibility of the requested site and return its data.

    Reads the selected features and their "<feature>Perc" weights from
    self.request, fetches or reuses cached values through self.db, runs
    any dynamically-added custom dimensions, scores everything with
    webcredScore, and always records the assessment time.

    Returns:
        The (possibly DB-refreshed) data dict; data['error'] is None on
        success, an error message otherwise.
    """
    now = datetime.now()
    if not isinstance(self.request, dict):
        # request.args is an ImmutableMultiDict; normalise it to a dict
        self.request = dict(self.request.args)

    data = {}
    req = {'args': {}}
    percentage = {}
    site = None
    dump = True

    try:
        # Copy each requested feature into req['args'] and collect its
        # matching "<feature>Perc" weight.
        # TODO come back and do this properly
        for keys in apiList.keys():
            value = self.request.get(keys, None)
            if value:
                perc = keys + "Perc"
                if isinstance(value, list):
                    # MultiDict values arrive as single-element lists
                    req['args'][keys] = str(value[0])
                    if self.request.get(perc):
                        percentage[keys] = self.request.get(perc)[0]
                else:
                    req['args'][keys] = value
                    if self.request.get(perc):
                        percentage[keys] = self.request.get(perc)

        data['url'] = req['args']['site']
        site = Urlattributes(url=req['args'].get('site', None))

        # WARNING there can be some issue with genre extraction
        data['genre'] = self.request.get('genre', None)

        if data['url'] != site.geturl():
            data['redirected'] = site.geturl()
        data['lastmod'] = site.getlastmod()

        # 'site' is not a WEBCred parameter
        del req['args']['site']

        # Reuse cached values when the URL is already in the database.
        if self.db.filter('url', data['url']).count():
            if (self.db.filter('lastmod', data['lastmod']).count()
                    or not data['lastmod']):
                # lastmod unchanged: re-extract only the columns that are
                # still None (None means "not successfully extracted yet").
                data = self.db.getdata('url', data['url'])
                for k, v in data.items():
                    if v or str(v) == '0':
                        # always re-assess page load time
                        if k != 'pageloadtime':
                            req['args'][k] = 'false'
                dump = False
            else:
                data = self.db.getdata('url', data['url'])

        data = self.extractValue(req, apiList, data, site)

        # HACK 13 is a calculated number, refer to index.html, where new
        # dimensions are dynamically added
        number = 13
        while True:
            dim = "dimension" + str(number)
            API = "api" + str(number)
            if dim not in self.request.keys():
                break
            try:
                data[self.request.get(dim)[0]] = surface.dimapi(
                    site.geturl(), self.request.get(API)[0])
                perc = API + "Perc"
                percentage[dim] = self.request.get(perc)[0]
            except WebcredError as e:
                # WebcredError carries a .message attribute (project type)
                data[self.request.get(dim)[0]] = e.message
            except Exception:
                # narrowed from a bare `except:`
                data[self.request.get(dim)[0]] = "Fatal ERROR"
            number += 1

        data = webcredScore(data, percentage)
        data['error'] = None
    except WebcredError as e:
        data['error'] = e.message
        dump = False
    except Exception:
        ex_type, ex_value, ex_traceback = sys.exc_info()
        trace_back = traceback.extract_tb(ex_traceback)
        stack_trace = [
            "File : %s , Line : %d, Func.Name : %s, Message : %s" %
            (trace[0], trace[1], trace[2], trace[3]) for trace in trace_back
        ]
        logger.info(ex_value)
        logger.debug(stack_trace)
        # HACK if it's not a WebcredError it's probably a python error
        data['error'] = 'Fatal Error'
        dump = False
        # .get(): 'url' may not be set if the failure happened early
        logger.debug(data.get('url'))
    finally:
        now = str((datetime.now() - now).total_seconds())
        data['assess_time'] = now

        # Persist whatever was gathered. Guarded so an early failure
        # (before data['url'] was set) no longer raises KeyError here.
        if data.get('url'):
            self.db.update('url', data['url'], data)
            # dump raw text/html of the page unless served from cache
            if dump:
                self.dumpRaw(site)
            data = self.db.getdata('url', data['url'])
            # prevent users from learning the dump location
            del data['html']
            del data['text']
            logger.debug(data['url'])
        logger.debug('Time = {}'.format(now))
    # moved out of `finally` so a return no longer masks unexpected
    # exceptions raised during cleanup (ruff B012)
    return data