def __init__(self, lang_iso=None, ads_list=None):
    if lang_iso:
        try:
            iso = open(lang_iso, "r")
            self.isoList = iso.read().split()
            isoList = []
            for code in self.isoList:
                # build '=<code>' and '="<code>"' variants for matching
                isoList.append('=' + code)
                isoList.append('="' + code + '"')
            self.isoList = isoList
            self.isoPattern = self.regexCompile(self.isoList)
            iso.close()
        except WebcredError as e:
            raise WebcredError(e.message)
        except Exception:
            raise WebcredError('Unable to open {} file'.format(lang_iso))
    else:
        logger.debug('Provide Language iso file')

    if ads_list:
        try:
            ads = open(ads_list, "r")
            self.adsList = ads.read().split()
            self.adsPattern = self.regexCompile(self.adsList)
            ads.close()
            print('successful with ads compilation')
        except WebcredError as e:
            raise WebcredError(e.message)
        except Exception:
            raise WebcredError('Unable to open {} file'.format(ads_list))
    else:
        logger.debug('Provide a good ads list')
def factoise(self):
    if not self.factorise:
        raise WebcredError('Provide attr to factorise')

    global lastmodMaxMonths

    for index in range(len(self.data)):
        if self.data[index].get(self.name):
            modified = 0
            # condition for lastmod
            if self.name == "lastmod":
                value = self.data[index][self.name]
                value = self.getDateDifference(value)
                if value < lastmodMaxMonths:
                    self.data[index][self.name] = self.factorise.get(
                        lastmodMaxMonths)
                    modified = 1
            # condition for everything else
            else:
                value = self.data[index][self.name]
                for k, v in self.factorise.items():
                    if str(value) == str(k):
                        self.data[index][self.name] = v
                        modified = 1
            if not modified:
                if 'else' in self.factorise:
                    self.data[index][self.name] = self.factorise.get('else')

    return self.data
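# --- Example (not part of the module) ---------------------------------
# A minimal, self-contained sketch of how a factorise mapping rewrites
# attribute values, mirroring the exact-match / 'else' fallback logic of
# factoise() above. The attribute name ('domain') and the mapping values
# are illustrative assumptions, not taken from the project's data files.
def _factorise_sketch():
    data = [{'domain': 'org'}, {'domain': 'com'}, {'domain': 'xyz'}]
    factorise = {'org': 1, 'com': 2, 'else': 0}
    for row in data:
        value = row.get('domain')
        if value is None:
            continue
        # exact string match against a mapping key, else the 'else' bucket
        row['domain'] = factorise.get(str(value), factorise.get('else'))
    return data  # [{'domain': 1}, {'domain': 2}, {'domain': 0}]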
def regexMatch(self, pattern=None, data=None):
    if not pattern:
        raise WebcredError('Provide regex pattern')
    if not data:
        raise WebcredError('Provide data to match with pattern')

    match = None
    for element in pattern:
        match = element.search(data)
        if match:
            break

    if match:
        return True, element.pattern
    return False, None
def regexCompile(self, data=None):
    if not data:
        raise WebcredError('Provide data to compile')

    pattern = []
    for element in data:
        # escape the literal token, then compile it in verbose mode
        temp = re.compile(re.escape(element), re.X)
        pattern.append(temp)
    return pattern
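# --- Example (not part of the module) ---------------------------------
# Minimal sketch of the compile-then-search flow implemented by
# regexCompile() and regexMatch(): every token is escaped and compiled,
# and the first pattern whose search() hits is reported. The sample
# tokens and haystack below are assumptions for illustration only.
import re

def _regex_sketch():
    tokens = ['=en', '="en"', '=de', '="de"']
    patterns = [re.compile(re.escape(tok), re.X) for tok in tokens]
    haystack = '<html lang="en">'
    for p in patterns:
        if p.search(haystack):
            return True, p.pattern  # the escaped '="en"' pattern matches
    return False, None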
def dimapi(url, api):
    # REVIEW
    try:
        uri = Urlattributes(api)
        raw = uri.gettext()
        # result = literal_eval(raw[1:-2])
        return raw
    except WebcredError:
        raise WebcredError("Give valid API")
    except Exception:
        return 'NA'
def __init__(self, data=None, name=None):
    if not data or not name:
        raise WebcredError('Need both data and name arguments')

    self.reverse = self.dataList = self.mean = self.deviation = None
    self.factorise = None
    self.data = data
    self.name = name[0]

    if isinstance(name[1], str):
        if name[1] == 'reverse':
            self.reverse = True
    elif isinstance(name[1], dict):
        self.factorise = name[1]
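# --- Example (not part of the module) ---------------------------------
# The constructor above takes name as a (key, spec) pair: a string spec
# equal to 'reverse' flips the normalisation direction, while a dict
# spec is stored as the factorise mapping. The attribute keys, rows and
# mapping values below are assumptions for illustration only.
def _normalize_name_sketch(rows):
    # string spec: reverse-scored attribute
    n1 = Normalize(rows, ('pageloadtime', 'reverse'))
    # dict spec: factorise mapping with an 'else' fallback
    n2 = Normalize(rows, ('lastmod', {24: 1, 'else': 0}))
    return n1, n2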
class Urlattributes(object):
    # HACK come back and do this properly
    try:
        # TODO fetch ads list dynamically from org
        if not patternMatching:
            patternMatching = PatternMatching(
                lang_iso='data/essentials/lang_iso.txt',
                ads_list='data/essentials/easylist.txt')
            print('end patternMatching')

        global normalizedData
        global normalizeCategory

        if not normalizedData:
            normalizedData = {}

            # read existing data
            old_data = 'data/json/data2.json'
            old_data = open(old_data, 'r').read()
            old_data = old_data.split('\n')

            new_data = 'data/json/new_data.json'
            new_data = open(new_data, 'r').read()
            new_data = new_data.split('\n')

            re_data = 'data/json/re_data.json'
            re_data = open(re_data, 'r').read()
            re_data = re_data.split('\n')

            # list with string/buffer as values
            file_ = list(set(new_data + old_data + re_data))

            # final json_List of data
            data = []
            for element in file_[:-1]:
                try:
                    metadata = json.loads(str(element))
                    # if metadata.get('redirected'):
                    #     url = metadata['redirected']
                    # else:
                    #     url = metadata['Url']
                    # obj = utils.Domain(url)
                    # url = obj.getnetloc()
                    # metadata['domain_similarity'] = scorefile_data.get(url)
                except Exception:
                    continue
                if metadata.get('Error'):
                    continue
                data.append(metadata)

            # get data from postgres
            db = Database(Features)
            data = db.getdbdata()

            it = normalizeCategory['3'].items()
            for k in it:
                normalizedData[k[0]] = Normalize(data, k)
                data = normalizedData[k[0]].normalize()

            it = list(normalizeCategory['misc'].items())[0]
            # summation of hyperlinks_attribute values
            for index in range(len(data)):
                if data[index].get(it[0]):
                    sum_hyperlinks_attributes = 0
                    tempData = data[index].get(it[0])
                    try:
                        for k, v in tempData.items():
                            sum_hyperlinks_attributes += v
                    except Exception:
                        # TimeOut error clause
                        pass
                    finally:
                        data[index][it[0]] = sum_hyperlinks_attributes
            normalizedData[it[0]] = Normalize(data, it)
            data = normalizedData[it[0]].normalize()

            for k in normalizeCategory['2'].items():
                print("normalizing", k)
                normalizedData[k[0]] = Normalize(data, k)
                data = normalizedData[k[0]].factoise()

            # csv_filename = 'analysis/WebcredNormalized.csv'
            #
            # pipe = Pipeline()
            # csv = pipe.convertjson(data)
            # f = open(csv_filename, 'w')
            # f.write(csv)
            # f.close()
    except WebcredError as e:
        raise WebcredError(e.message)

    def __init__(self, url=None):
        if patternMatching:
            self.patternMatching = patternMatching
        self.hdr = {'User-Agent': 'Mozilla/5.0'}
        self.requests = self.urllibreq = self.soup = self.text = None
        self.netloc = self.header = self.lastmod = self.size = \
            self.html = self.domain = self.loadTime = None
        self.lock = threading.Lock()

        if url:
            if not validators.url(url):
                raise WebcredError('Provide a valid url')
            self.url = url
            self.originalUrl = copy.deepcopy(url)
            # case of redirections
            resp = self.getrequests()
            if resp.status_code // 100 >= 4:
                raise WebcredError('Response {}'.format(resp.status_code))
            self.url = resp.url
        else:
            raise WebcredError('Provide a url')

    def getloadtime(self):
        return self.loadTime

    def getoriginalurl(self):
        return self.originalUrl

    def getjson(self):
        return self.getrequests().json()

    def geturl(self):
        return self.url

    def gethdr(self):
        return self.hdr

    def getheader(self):
        if not self.header:
            self.header = self.geturllibreq().headers
        return self.header

    def getrequests(self):
        if not self.requests:
            self.requests = self.geturllibreq()
        return self.requests

    def geturllibreq(self):
        # with self.lock:
        if not self.urllibreq:
            try:
                now = datetime.now()
                self.urllibreq = requests.get(url=self.url, headers=self.hdr)
                self.loadTime = int((datetime.now() - now).total_seconds())
            except Exception:
                # Get current system exception
                ex_type, ex_value, ex_traceback = sys.exc_info()
                # Extract unformatted stack traces as tuples
                trace_back = traceback.extract_tb(ex_traceback)
                # Format stacktrace
                stack_trace = list()
                for trace in trace_back:
                    stack_trace.append(
                        "File : %s , Line : %d, Func.Name : %s, Message : %s"
                        % (trace[0], trace[1], trace[2], trace[3]))
                # HACK if it's not a WebcredError,
                # then probably it's a python error
                raise WebcredError(ex_value)
        return self.urllibreq

    def clean_html(self, html):
        """
        Copied from NLTK package.
        Remove HTML markup from the given string.

        :param html: the HTML string to be cleaned
        :type html: str
        :rtype: str
        """
        # First we remove inline JavaScript/CSS:
        cleaned = re.sub(r"(?is)<(script|style).*?>.*?(</\1>)", "",
                         html.strip())
        # Then we remove html comments. This has to be done before removing
        # regular tags since comments can contain '>' characters.
        cleaned = re.sub(r"(?s)<!--(.*?)-->[\n]?", "", cleaned)
        # Next we can remove the remaining tags:
        cleaned = re.sub(r"(?s)<.*?>", " ", cleaned)
        # Finally, we deal with whitespace
        cleaned = re.sub(r"&nbsp;", " ", cleaned)
        cleaned = re.sub(r"  ", " ", cleaned)
        cleaned = re.sub(r"  ", " ", cleaned)
        return cleaned.strip()

    def gettext(self):
        if not self.text:
            text = self.gethtml()
            text = self.clean_html(text)
            self.text = html2text(text)
        return self.text

    def gethtml(self):
        if not self.html:
            self.html = self.getrequests().text
        return self.html

    def getsoup(self, parser='html.parser'):
        data = self.getrequests().text
        try:
            self.soup = BeautifulSoup(data, parser)
        except Exception:
            raise WebcredError('Error while parsing using bs4')
        return self.soup

    def getnetloc(self):
        if not self.netloc:
            try:
                parsed_uri = urlparse(self.geturl())
                self.netloc = '{uri.netloc}'.format(uri=parsed_uri)
            except Exception:
                logger.debug('Error while fetching attributes from parsed_uri')
        return self.netloc

    def getdomain(self):
        if not self.domain:
            try:
                netloc = self.getnetloc()
                self.domain = netloc.split('.')[-1]
            except Exception:
                raise WebcredError('provided {} not valid'.format(netloc))
        return self.domain

    def getPatternObj(self):
        try:
            return self.patternMatching
        except Exception:
            raise WebcredError('Pattern Obj is NA')

    def getsize(self):
        if not self.size:
            t = self.gettext()
            try:
                self.size = len(t)
            except Exception:
                raise WebcredError('error in retrieving length')
        return self.size

    def getlastmod(self):
        if self.lastmod:
            return self.lastmod
        try:
            data = None
            # fetching data from the Wayback Machine availability API
            for i in range(15):
                uri = "http://archive.org/wayback/available?url=" + \
                    self.geturl()
                uri = Urlattributes(uri)
                resp = uri.geturllibreq()
                if resp.status_code // 100 < 4:
                    resp = resp.json()
                    try:
                        data = arrow.get(
                            resp['archived_snapshots']['closest']['timestamp'],
                            'YYYYMMDDHHmmss').timestamp
                    except Exception:
                        data = str(0)
                if data:
                    self.lastmod = int(data)
                    break

            # Fallback via the response 'Date' header, e.g.
            # 'Mon, 09 Jul 2018 07:29:16 GMT', was abandoned because the
            # '%z' directive failed to parse with datetime.strptime.
        except Exception:
            # Get current system exception
            ex_type, ex_value, ex_traceback = sys.exc_info()
            # Extract unformatted stack traces as tuples
            trace_back = traceback.extract_tb(ex_traceback)
            # Format stacktrace
            stack_trace = list()
            for trace in trace_back:
                stack_trace.append(
                    "File : %s , Line : %d, Func.Name : %s, Message : %s"
                    % (trace[0], trace[1], trace[2], trace[3]))
            logger.info(ex_value)
            logger.debug(stack_trace)
            self.lastmod = None
        return self.lastmod

    def freemem(self):
        del self