# Imports assumed by these snippets (stripped by the example listing);
# WebcredError, logger, patternMatching, PatternMatching, normalizeCategory,
# lastmodMaxMonths, Database and Features are module globals defined
# elsewhere in the repo.
import copy
import json
import re
import sys
import threading
import traceback
from datetime import datetime
from urllib.parse import urlparse

import arrow
import requests
import validators
from bs4 import BeautifulSoup
from html2text import html2text


    def __init__(self, lang_iso=None, ads_list=None):
        if lang_iso:
            try:
                # one ISO language code per line
                with open(lang_iso, "r") as iso:
                    self.isoList = iso.read().split()
                isoList = []
                for code in self.isoList:
                    # match both lang=en and lang="en" style attributes
                    isoList.append('=' + code)
                    isoList.append('="' + code + '"')
                self.isoList = isoList
                self.isoPattern = self.regexCompile(self.isoList)
            except WebcredError:
                raise
            except Exception:
                raise WebcredError('Unable to open {} file'.format(lang_iso))
        else:
            logger.debug('Provide a language ISO file')

        if ads_list:
            try:
                # one ad-server pattern per line (e.g. an EasyList dump)
                with open(ads_list, "r") as ads:
                    self.adsList = ads.read().split()
                self.adsPattern = self.regexCompile(self.adsList)
                print('successful with ads compilation')
            except WebcredError:
                raise
            except Exception:
                raise WebcredError('Unable to open {} file'.format(ads_list))
        else:
            logger.debug('Provide a valid ads list')
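
    # A minimal usage sketch (paths match the ones hard-coded later in this
    # file; everything here is illustrative, not part of the class):
    #   pm = PatternMatching(lang_iso='data/essentials/lang_iso.txt',
    #                        ads_list='data/essentials/easylist.txt')
    #   pm.isoPattern and pm.adsPattern then hold compiled regex lists.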
    def factoise(self):
        # map raw feature values onto the factor labels in self.factorise
        if not self.factorise:
            raise WebcredError('Provide attr to factorise')
        global lastmodMaxMonths

        for index in range(len(self.data)):
            if self.data[index].get(self.name):
                modified = 0

                # condition for lastmod
                if self.name == "lastmod":
                    value = self.data[index][self.name]
                    value = self.getDateDifference(value)
                    if value < lastmodMaxMonths:
                        self.data[index][self.name] = self.factorise.get(
                            lastmodMaxMonths)
                        modified = 1

                # condition for everything else
                else:
                    value = self.data[index][self.name]
                    for k, v in self.factorise.items():
                        if str(value) == str(k):
                            self.data[index][self.name] = v
                            modified = 1
                if not modified:
                    if 'else' in self.factorise.keys():
                        self.data[index][self.name] = self.factorise.get(
                            'else')
        return self.data
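
    # A minimal sketch of what factoise does (hypothetical values): with
    # factorise={'com': 1, 'else': 0} and name='domain', a row
    # {'domain': 'com'} becomes {'domain': 1}; any unmatched value falls
    # back to the 'else' factor, giving {'domain': 0}.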
    def geturllibreq(self):
        # with self.lock:
        if not self.urllibreq:
            try:
                now = datetime.now()
                self.urllibreq = requests.get(url=self.url, headers=self.hdr)
                self.loadTime = int((datetime.now() - now).total_seconds())
            except Exception:
                # get the current exception and its traceback
                ex_type, ex_value, ex_traceback = sys.exc_info()

                # extract the raw stack frames as tuples
                trace_back = traceback.extract_tb(ex_traceback)

                # format the stack trace for logging
                stack_trace = list()

                for trace in trace_back:
                    stack_trace.append(
                        "File : %s , Line : %d, Func.Name : %s, Message : %s" %
                        (trace[0], trace[1], trace[2], trace[3]))

                logger.debug(stack_trace)
                # HACK if it's not a WebcredError,
                #  then it's probably a plain Python error
                raise WebcredError(ex_value)

        return self.urllibreq
    def getsize(self):
        if not self.size:
            t = self.gettext()
            try:
                self.size = len(t)
            except Exception:
                raise WebcredError('error in retrieving length')
        return self.size
    def getsoup(self, parser='html.parser'):
        data = self.getrequests().text
        try:
            self.soup = BeautifulSoup(data, parser)
        except Exception:
            raise WebcredError('Error while parsing using bs4')

        return self.soup
    def regexMatch(self, pattern=None, data=None):

        if not pattern:
            raise WebcredError('Provide regex pattern')

        if not data:
            raise WebcredError('Provide data to match with pattern')

        # return on the first pattern that matches
        for element in pattern:
            match = element.search(data)
            if match:
                return True, element.pattern

        return False, None
    def getdomain(self):
        # top-level domain, e.g. 'com' for 'example.com'
        if not self.domain:
            try:
                netloc = self.getnetloc()
                self.domain = netloc.split('.')[-1]
            except Exception:
                raise WebcredError('provided url {} not valid'.format(self.url))

        return self.domain
    def regexCompile(self, data=None):
        if not data:
            raise WebcredError('Provide data to compile')

        # compile each literal string into an escaped (literal-match) pattern
        pattern = []
        for element in data:
            temp = re.compile(re.escape(element), re.X)
            pattern.append(temp)
        return pattern
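
    # A minimal usage sketch (input strings are hypothetical):
    #   patterns = self.regexCompile(['=en', '="en"'])
    #   matched, pat = self.regexMatch(patterns, '<html lang="en">')
    #   # matched is True; pat is the escaped pattern that hit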
def dimapi(url, api):
    # REVIEW: the url argument is currently unused
    try:
        uri = Urlattributes(api)
        raw = uri.gettext()
        # result = literal_eval(raw[1:-2])
        return raw
    except WebcredError:
        raise WebcredError("Give a valid API")
    except Exception:
        return 'NA'
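
# A minimal usage sketch (the endpoint below is hypothetical):
#   raw = dimapi(url, 'https://api.example.org/metrics?uri=' + url)
#   # returns the response body as text, or 'NA' on a non-Webcred failure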
    def __init__(self, url=None):
        # print 'here'
        if patternMatching:
            self.patternMatching = patternMatching

        self.hdr = {'User-Agent': 'Mozilla/5.0'}
        self.requests = self.urllibreq = self.soup = self.text = None
        self.netloc = self.header = self.lastmod = self.size = \
            self.html = self.domain = self.loadTime = None
        self.lock = threading.Lock()
        if url:
            if not validators.url(url):
                raise WebcredError('Provide a valid url')
            self.url = url
            self.originalUrl = copy.deepcopy(url)

            # case of redirections
            resp = self.getrequests()
            if resp.status_code // 100 >= 4:
                raise WebcredError('Response {}'.format(resp.status_code))
            self.url = resp.url

        else:
            raise WebcredError('Provide a url')
    def __init__(self, data=None, name=None):
        if not data or not name:
            raise WebcredError('Provide both data and name arguments')

        self.reverse = self.dataList = self.mean = self.deviation = None
        self.factorise = None

        self.data = data
        self.name = name[0]

        if isinstance(name[1], str):
            if name[1] == 'reverse':
                self.reverse = True

        elif isinstance(name[1], dict):
            self.factorise = name[1]
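
    # A minimal sketch of the expected name argument (values hypothetical):
    #   Normalize(data, ('pageloadtime', 'reverse'))       # reverse-scored
    #   Normalize(data, ('domain', {'com': 1, 'else': 0})) # factorised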
    def getPatternObj(self):
        try:
            return self.patternMatching
        except Exception:
            raise WebcredError('Pattern Obj is NA')
class Urlattributes(object):
    # HACK come back and do this properly
    try:
        # TODO fetch ads list dynamically from org
        if not patternMatching:
            patternMatching = PatternMatching(
                lang_iso='data/essentials/lang_iso.txt',
                ads_list='data/essentials/easylist.txt')
            print('end patternMatching')

        global normalizedData
        global normalizeCategory
        if not normalizedData:
            normalizedData = {}
            # read the existing data dumps (one JSON object per line)
            file_ = []
            for path in ('data/json/data2.json',
                         'data/json/new_data.json',
                         'data/json/re_data.json'):
                with open(path, 'r') as f:
                    file_ += f.read().split('\n')

            # de-duplicated list with string/buffer values
            file_ = list(set(file_))

            # final JSON list of data; skip empty lines left by the split
            data = []
            for element in file_:
                if not element:
                    continue
                try:
                    metadata = json.loads(str(element))
                    # if metadata.get('redirected'):
                    #     url = metadata['redirected']
                    # else:
                    #     url = metadata['Url']
                    # obj = utils.Domain(url)
                    # url = obj.getnetloc()
                    # metadata['domain_similarity'] = scorefile_data.get(url)
                except Exception:
                    continue
                if metadata.get('Error'):
                    continue
                data.append(metadata)

            # get data from postgres (this replaces the file-based data above)
            db = Database(Features)
            data = db.getdbdata()

            it = normalizeCategory['3'].items()
            for k in it:
                normalizedData[k[0]] = Normalize(data, k)
                data = normalizedData[k[0]].normalize()

            it = list(normalizeCategory['misc'].items())[0]
            # summation of hyperlinks_attribute values
            for index in range(len(data)):
                if data[index].get(it[0]):
                    sum_hyperlinks_attributes = 0
                    tempData = data[index].get(it[0])
                    try:
                        for k, v in tempData.items():
                            sum_hyperlinks_attributes += v
                    except Exception:
                        # TimeOut error clause
                        pass
                    finally:
                        data[index][it[0]] = sum_hyperlinks_attributes

            normalizedData[it[0]] = Normalize(data, it)
            data = normalizedData[it[0]].normalize()

            for k in normalizeCategory['2'].items():
                print "normalizing", k
                normalizedData[k[0]] = Normalize(data, k)
                data = normalizedData[k[0]].factoise()

            # csv_filename = 'analysis/WebcredNormalized.csv'
            #
            # pipe = Pipeline()
            # csv = pipe.convertjson(data)
            # f = open(csv_filename,'w')
            # f.write(csv)
            # f.close()

    except WebcredError:
        raise
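
    # A minimal sketch of the normalizeCategory layout assumed above (the
    # keys are hypothetical; the real mapping lives elsewhere in the repo):
    #   normalizeCategory = {
    #       '3': {'outlinks': 'reverse'},            # mean/deviation-scored
    #       'misc': {'hyperlinks_attribute': 'sum'}, # summed, then scored
    #       '2': {'domain': {'com': 1, 'else': 0}},  # factorised
    #   }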

    def __init__(self, url=None):
        # print 'here'
        if patternMatching:
            self.patternMatching = patternMatching

        self.hdr = {'User-Agent': 'Mozilla/5.0'}
        self.requests = self.urllibreq = self.soup = self.text = None
        self.netloc = self.header = self.lastmod = self.size = \
            self.html = self.domain = self.loadTime = None
        self.lock = threading.Lock()
        if url:
            if not validators.url(url):
                raise WebcredError('Provide a valid url')
            self.url = url
            self.originalUrl = copy.deepcopy(url)

            # case of redirections
            resp = self.getrequests()
            if resp.status_code // 100 >= 4:
                raise WebcredError('Response {}'.format(resp.status_code))
            self.url = resp.url

        else:
            raise WebcredError('Provide a url')

    def getloadtime(self):
        return self.loadTime

    def getoriginalurl(self):
        return self.originalUrl

    def getjson(self):
        return self.getrequests().json()

    def geturl(self):
        return self.url

    def gethdr(self):
        return self.hdr

    def getheader(self):
        if not self.header:
            self.header = self.geturllibreq().headers

        return self.header

    def getrequests(self):
        if not self.requests:
            self.requests = self.geturllibreq()

        return self.requests

    def geturllibreq(self):
        # with self.lock:
        if not self.urllibreq:
            try:
                now = datetime.now()
                self.urllibreq = requests.get(url=self.url, headers=self.hdr)
                self.loadTime = int((datetime.now() - now).total_seconds())
            except Exception:
                # get the current exception and its traceback
                ex_type, ex_value, ex_traceback = sys.exc_info()

                # extract the raw stack frames as tuples
                trace_back = traceback.extract_tb(ex_traceback)

                # format the stack trace for logging
                stack_trace = list()

                for trace in trace_back:
                    stack_trace.append(
                        "File : %s , Line : %d, Func.Name : %s, Message : %s" %
                        (trace[0], trace[1], trace[2], trace[3]))

                logger.debug(stack_trace)
                # HACK if it's not a WebcredError,
                #  then it's probably a plain Python error
                raise WebcredError(ex_value)

        return self.urllibreq

    def clean_html(self, html):
        """
        Copied from NLTK package.
        Remove HTML markup from the given string.

        :param html: the HTML string to be cleaned
        :type html: str
        :rtype: str
        """

        # First we remove inline JavaScript/CSS:
        cleaned = re.sub(r"(?is)<(script|style).*?>.*?(</\1>)", "",
                         html.strip())
        # Then we remove html comments.
        # This has to be done before removing regular
        # tags since comments can contain '>' characters.
        cleaned = re.sub(r"(?s)<!--(.*?)-->[\n]?", "", cleaned)
        # Next we can remove the remaining tags:
        cleaned = re.sub(r"(?s)<.*?>", " ", cleaned)
        # Finally, we deal with whitespace
        cleaned = re.sub(r"&nbsp;", " ", cleaned)
        cleaned = re.sub(r"  ", " ", cleaned)
        cleaned = re.sub(r"  ", " ", cleaned)
        return cleaned.strip()
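
    # For example (hypothetical input):
    #   clean_html('<p>Hi <b>there</b><!-- note --></p>')  ->  'Hi there'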

    def gettext(self):
        if not self.text:
            text = self.gethtml()
            text = self.clean_html(text)
            self.text = html2text(text)

        return self.text

    def gethtml(self):
        if not self.html:
            self.html = self.getrequests().text
        return self.html

    def getsoup(self, parser='html.parser'):
        data = self.getrequests().text
        try:
            self.soup = BeautifulSoup(data, parser)
        except Exception:
            raise WebcredError('Error while parsing using bs4')

        return self.soup

    def getnetloc(self):
        if not self.netloc:
            try:
                parsed_uri = urlparse(self.geturl())
                self.netloc = parsed_uri.netloc
            except Exception:
                logger.debug('Error while fetching attributes from parsed_uri')

        return self.netloc

    def getdomain(self):
        # top-level domain, e.g. 'com' for 'example.com'
        if not self.domain:
            try:
                netloc = self.getnetloc()
                self.domain = netloc.split('.')[-1]
            except Exception:
                raise WebcredError('provided url {} not valid'.format(self.url))

        return self.domain

    def getPatternObj(self):
        try:
            return self.patternMatching
        except:
            raise WebcredError('Pattern Obj is NA')

    def getsize(self):
        if not self.size:
            t = self.gettext()
            try:
                self.size = len(t)
            except Exception:
                raise WebcredError('error in retrieving length')
        return self.size

    def getlastmod(self):

        if self.lastmod:
            return self.lastmod

        try:
            data = None
            # fetch the closest snapshot from the Wayback Machine archive,
            # retrying up to 15 times
            for i in range(15):
                uri = "http://archive.org/wayback/available?url=" + \
                      self.geturl()
                uri = Urlattributes(uri)
                resp = uri.geturllibreq()
                if resp.status_code // 100 < 4:
                    resp = resp.json()
                    try:
                        data = arrow.get(
                            resp['archived_snapshots']['closest']['timestamp'],
                            'YYYYMMDDHHmmss').timestamp
                    except Exception:
                        # no snapshot found; fall back to epoch 0
                        data = str(0)
                if data:
                    self.lastmod = int(data)
                    break
        #     if not data:
        #         # fallback: parse the response 'Date' header, e.g.
        #         # 'Mon, 09 Jul 2018 07:29:16 GMT'; abandoned because the
        #         # %z strptime directive failed on this format
        except Exception:
            # Get current system exception
            ex_type, ex_value, ex_traceback = sys.exc_info()

            # Extract unformatted stack frames as tuples
            trace_back = traceback.extract_tb(ex_traceback)

            # Format stacktrace
            stack_trace = list()

            for trace in trace_back:
                stack_trace.append(
                    "File : %s , Line : %d, Func.Name : %s, Message : %s" %
                    (trace[0], trace[1], trace[2], trace[3]))

            # print("Exception type : %s " % ex_type.__name__)
            logger.info(ex_value)
            logger.debug(stack_trace)
            self.lastmod = None

        return self.lastmod
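
    # A minimal usage sketch: getlastmod() returns the Unix timestamp of the
    # closest Wayback Machine snapshot, or None when the lookup fails:
    #   ts = Urlattributes('https://example.com').getlastmod()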

    def freemem(self):
        # NOTE: 'del self' only drops the local reference; the object is
        # reclaimed once all external references are gone
        del self
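

# A minimal end-to-end sketch (the URL is hypothetical):
#   uri = Urlattributes('https://example.com')
#   print(uri.getnetloc(), uri.getdomain(), uri.getsize())
#   print(uri.getlastmod(), uri.getloadtime())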