Example #1
0
def dimapi(url, api):
    """Fetch the raw text of a user-supplied dimension API endpoint.

    Args:
        url: site url (unused here; kept for interface compatibility
            with other dimension helpers — TODO confirm against callers).
        api: the API endpoint to fetch.

    Returns:
        The raw response text, or the sentinel string 'NA' when the
        fetch fails for any non-Webcred reason.

    Raises:
        WebcredError: when the API url is invalid.
    """
    try:
        uri = Urlattributes(api)
        return uri.gettext()
    except WebcredError:
        # Invalid API urls are surfaced to the caller with a uniform
        # message rather than swallowed like other failures.
        raise WebcredError("Give valid API")
    except Exception:
        # Best-effort: any other failure degrades to a sentinel value.
        # (Was a bare `except:`, which would also trap KeyboardInterrupt.)
        return 'NA'
Example #2
0
def funcBrokenllinks(url):
    """Report whether *url* looks like a broken link.

    A WebcredError raised while building Urlattributes(url) is taken
    as evidence the link is broken; a falsy url is never broken.
    """
    if not url:
        return False
    try:
        Urlattributes(url)
    except WebcredError:
        return True
    return False
Example #3
0
def getImgratio(url):
    """Return the fraction of the page's weight contributed by text.

    Spawns one thread per valid <img src> to fetch each image's size,
    sums them, and computes text_size / (text_size + total image size).

    Args:
        url: a Urlattributes-like object exposing getsize(), getsoup()
            and geturl().

    Returns:
        A float ratio in [0, 1]; 0.0 when both sizes are zero (the
        original code would divide by zero here).
    """
    total_img_size = 0
    threads = []

    text_size = url.getsize()
    soup = url.getsoup()

    # Kick off one size-fetching thread per image; results are joined
    # and summed below.
    for link in soup.find_all('img', src=True):
        uri = link.get('src', None)
        # Resolve scheme-less / relative srcs against the page url.
        if not uri.startswith(('http://', 'https://')):
            uri = url.geturl() + uri

        if validators.url(uri):
            try:
                t = MyThread(funcImgratio, 'Imgratio', Urlattributes(uri))
                t.start()
                threads.append(t)
            except Exception as ex:
                # Format the traceback for debug logging.
                stack_trace = [
                    "File : %s , Line : %d, Func.Name : %s, Message : %s" %
                    (trace[0], trace[1], trace[2], trace[3])
                    for trace in traceback.extract_tb(sys.exc_info()[2])
                ]
                # 'Response 202' means the fetch is still pending; skip
                # it quietly, log anything else.  str(ex) replaces the
                # Python-2-only `ex.message`, which raises AttributeError
                # on Python 3 for most exception types.
                if str(ex) != 'Response 202':
                    logger.warning(ex)
                    logger.debug(stack_trace)

    for t in threads:
        t.join()
        size = t.getResult()
        t.freemem()
        if isinstance(size, int):
            total_img_size += size

    total_size = total_img_size + text_size
    if not total_size:
        # Empty page with no images: avoid ZeroDivisionError.
        return 0.0
    return float(text_size) / total_size
Example #4
0
def googleinlink(url):
    """Count inbound links to *url* via the Google Custom Search API.

    Requires the Google_API_KEY environment variable; a missing key
    makes the string concatenation raise TypeError, which is caught
    and logged like any other failure.

    Returns:
        The totalResults count as an int, or None when the request
        fails or the response contains no "totalResults" line.
    """
    API_KEY = os.environ.get('Google_API_KEY')

    inlinks = None
    try:
        # keyword link is used in search query to search only hyperlinks
        uri = ('https://www.googleapis.com/customsearch/v1?key=' + API_KEY +
               '&cx=017576662512468239146:omuauf_lfve&q=link:' +
               url.getoriginalurl())
        uri = Urlattributes(uri)
        txt = uri.gettext()

        # Parse only the line that actually carries the count.  The old
        # code broke out of the loop and then parsed `line` regardless,
        # so a response without "totalResults" had its *last* line
        # parsed (and an empty response left `line` unbound).
        for line in txt.splitlines():
            if "totalResults" in line:
                inlinks = int(re.sub("[^0-9]", "", line))
                break

    except Exception:
        ex_type, ex_value, ex_traceback = sys.exc_info()
        # Format the traceback for debug logging.
        stack_trace = [
            "File : %s , Line : %d, Func.Name : %s, Message : %s" %
            (trace[0], trace[1], trace[2], trace[3])
            for trace in traceback.extract_tb(ex_traceback)
        ]
        logger.info('Inlinks error {}'.format(ex_value))
        logger.debug(stack_trace)

    return inlinks
Example #5
0
def getAlexarank(url):
    """Return the Alexa 'reach' rank for *url*, or None when absent.

    Args:
        url: a Urlattributes instance or anything accepted by the
            Urlattributes constructor.
    """
    if not isinstance(url, Urlattributes):
        url = Urlattributes(url)

    uri = "http://data.alexa.com/data?cli=10&dat=s&url=" + url.geturl()
    uri = Urlattributes(uri)
    soup = uri.getsoup()
    try:
        rank = soup.find("reach")['rank']
    except (TypeError, KeyError):
        # <reach> tag missing (find() -> None -> TypeError) or the tag
        # has no 'rank' attribute (KeyError).  Was a bare `except:`.
        rank = None
    return rank
Example #6
0
def getWot(url):
    """Query the WOT public API for *url* and return its scores.

    Returns:
        A dict with integer 'reputation' and 'confidence' keys, or
        None when the response does not parse into exactly two values.
    """
    if not isinstance(url, Urlattributes):
        url = Urlattributes(url)

    endpoint = ("http://api.mywot.com/0.4/public_link_json2?hosts=" +
                url.geturl() +
                "/&callback=&key=d60fa334759ae377ceb9cd679dfa22aec57ed998")
    raw = Urlattributes(endpoint).gettext()

    # The body is a JSONP-ish payload: strip the wrapper, evaluate the
    # literal, then pull the first bracketed pair of numbers out of its
    # string form.
    parsed = literal_eval(raw[1:-4])
    values = str(parsed).split(']')[0].split('[')[-1].split(',')

    if isinstance(values, list) and len(values) == 2:
        return {
            'reputation': int(values[0]),
            'confidence': int(values[1]),
        }
    return None
    def assess(self):
        """Assess the credibility of the site named in self.request.

        Normalizes the request, reuses cached database values when the
        page's lastmod is unchanged, extracts the remaining features,
        runs any user-defined dimension APIs, scores the result with
        webcredScore, and persists it.

        Returns:
            The (possibly partial) data dict; failures are reported via
            data['error'] rather than raised (the `return` in `finally`
            suppresses any in-flight exception — pre-existing behavior).
        """
        now = datetime.now()

        # Flask's request.args is an ImmutableMultiDict; normalize to a
        # plain dict so .get() semantics below are uniform.
        if not isinstance(self.request, dict):
            self.request = dict(self.request.args)

        data = {}
        req = {'args': {}}
        percentage = {}
        site = None
        dump = True
        try:
            # Copy the requested features into req['args'] and collect
            # each feature's percentage weight.
            # TODO come back and do this properly
            for keys in apiList.keys():
                if self.request.get(keys, None):
                    perc = keys + "Perc"
                    # MultiDict values arrive as single-item lists.
                    if isinstance(self.request.get(keys, None), list):
                        req['args'][keys] = str(self.request.get(keys)[0])
                        if self.request.get(perc):
                            percentage[keys] = self.request.get(perc)[0]
                    else:
                        req['args'][keys] = self.request.get(keys)
                        if self.request.get(perc):
                            percentage[keys] = self.request.get(perc)

            data['url'] = req['args']['site']

            site = Urlattributes(url=req['args'].get('site', None))

            # get genre
            # WARNING there can be some issue with it
            data['genre'] = self.request.get('genre', None)

            if data['url'] != site.geturl():
                data['redirected'] = site.geturl()

            data['lastmod'] = site.getlastmod()

            # site is not a WEBCred parameter
            del req['args']['site']

            # If the url is already in the database, reuse cached values:
            # when lastmod is unchanged (or unknown), only re-assess the
            # features that are still None, plus pageloadtime which is
            # always re-assessed; otherwise re-assess everything.
            if self.db.filter('url', data['url']).count():
                if self.db.filter(
                        'lastmod',
                        data['lastmod']).count() or not data['lastmod']:
                    data = self.db.getdata('url', data['url'])

                    # A None value means the feature was never
                    # successfully extracted.
                    for k, v in data.items():
                        if v or str(v) == '0':
                            if k != 'pageloadtime':
                                req['args'][k] = 'false'
                    dump = False
                else:
                    data = self.db.getdata('url', data['url'])

            data = self.extractValue(req, apiList, data, site)

            # HACK 13 is a calculated number, refer to index.html, where
            # new dimensions are dynamically added.
            # TODO come back and do this properly
            number = 13
            while True:
                dim = "dimension" + str(number)
                API = "api" + str(number)
                if dim not in self.request.keys():
                    break
                try:
                    data[self.request.get(dim)[0]] = surface.dimapi(
                        site.geturl(),
                        self.request.get(API)[0])
                    perc = API + "Perc"
                    percentage[dim] = self.request.get(perc)[0]
                except WebcredError as e:
                    data[self.request.get(dim)[0]] = e.message
                except Exception:
                    # Was a bare `except:`.
                    data[self.request.get(dim)[0]] = "Fatal ERROR"
                number += 1

            data = webcredScore(data, percentage)

            data['error'] = None

        except WebcredError as e:
            data['error'] = e.message
            dump = False
        except Exception:
            # Non-Webcred failure: log the traceback and flag the record.
            ex_type, ex_value, ex_traceback = sys.exc_info()
            stack_trace = [
                "File : %s , Line : %d, Func.Name : %s, Message : %s" %
                (trace[0], trace[1], trace[2], trace[3])
                for trace in traceback.extract_tb(ex_traceback)
            ]
            logger.info(ex_value)
            logger.debug(stack_trace)
            # HACK if it's not webcred error,
            #  then probably it's python error
            data['error'] = 'Fatal Error'
            dump = False
            logger.debug(data.get('url'))
        finally:
            now = str((datetime.now() - now).total_seconds())
            data['assess_time'] = now

            # NOTE(review): if the failure happened before data['url']
            # was assigned, there is nothing to persist or re-read —
            # the old code raised KeyError from inside `finally` here.
            if data.get('url'):
                # store it in data
                self.db.update('url', data['url'], data)

                # dump text and html of the page
                if dump:
                    self.dumpRaw(site)

                data = self.db.getdata('url', data['url'])

                # prevent users from learning the dump location
                # (pop tolerates rows that never stored these columns)
                data.pop('html', None)
                data.pop('text', None)

                logger.debug(data['url'])

            logger.debug('Time = {}'.format(now))

            return data