    def __init__(self, company_name):
        super().__init__(company_name)
        self.company_name = WebpageResolver(company_name).company_name
        try:
            self.cache = pd.read_csv(
                Scamwatcher.LOC + "cache.tsv", sep='\t', index_col='company')
        except FileNotFoundError:
            # Start an empty cache; set_index must be applied in place (or reassigned).
            self.cache = pd.DataFrame(columns=['company', 'rank'])
            self.cache.set_index('company', inplace=True)
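# --- Sketch: the load-or-create cache idiom shared by these __init__ methods. ---
# Hypothetical standalone version; path and columns mirror the snippets above, and
# note that DataFrame.set_index returns a new frame unless inplace=True is used.
import pandas as pd

CACHE_PATH = "cache.tsv"  # per-module in the real code, e.g. modules/SCAMWATCHER/cache.tsv

def load_cache(path: str = CACHE_PATH) -> pd.DataFrame:
    """Load the company -> rank cache, or start an empty one indexed by company."""
    try:
        return pd.read_csv(path, sep='\t', index_col='company')
    except FileNotFoundError:
        return pd.DataFrame(columns=['company', 'rank']).set_index('company')

cache = load_cache()
cache.loc["Example Ltd", "rank"] = 3   # illustrative entry
cache.to_csv(CACHE_PATH, sep='\t')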
    def __init__(self, company_name):
        super().__init__(None)
        self.tax_havens = TaxHeaven().return_data()['tax_heaven']
        self.cache = Cache('modules/WHO_IS/cache')
        resolv = WebpageResolver(company_name)
        self.company_name = resolv.company_name
        try:
            res = resolv.return_data()['webpage']
            self.webpages = list(set(res))
        except IndexError as e:
            print("WEBPAGE NOT FOUND")
            raise e
Example #3
    def return_data(self, **kwargs) -> dict:
        """ Returns Alexa Rank score in 0-4 scale.
             0 - high
             1 - moderate 
             2 - low
             3 - very low
             4 - not indexed
        """
        if self.company_name in self.cache.index:
            result = self.cache.loc[self.company_name].values[0]
            return {"AlexaRank": result}

        found = []
        found_full = []
        for webpage in self.webpages:
            page = WebpageResolver.get_html(AlexaRank.ALEXA_ROOT+webpage, stash=False)

            try:
                soup = bs4.BeautifulSoup(page, features="lxml")
                rank = soup.find_all("div", class_="rankmini-rank")[0].text.strip()
                rank = int(rank.lstrip("#").replace(",",""))

                rank_digit = np.digitize(rank, AlexaRank.BINS)
                found.append(rank_digit)
                found_full.append(rank)
            except IndexError:
                # The page is so small that it's not even indexed in Alexa
                found.append(4)
                found_full.append(-1)

        rank_digit = min(found)
        # Best raw rank across all webpages, ignoring the -1 sentinel for unindexed pages.
        rank = min((r for r in found_full if r >= 0), default=-1)
        # Cache the 0-4 digit so cache hits return the same scale as fresh lookups.
        self.cache.loc[self.company_name] = rank_digit
        self.cache.to_csv(AlexaRank.LOC + "cache.tsv", sep='\t')
        return {"AlexaRank": rank_digit, "AlexaRankScore": rank}
Example #4
    def check_if_polish_text(self, website):
        def tag_visible(element):
            if element.parent.name in [
                    'style', 'script', 'head', 'title', 'meta', '[document]'
            ]:
                return False
            if isinstance(element, bs4.element.Comment):
                return False
            return True

        def text_from_html(body):
            soup = BeautifulSoup(body, 'html.parser')
            texts = soup.find_all(string=True)
            visible_texts = filter(tag_visible, texts)
            return u" ".join(t.strip() for t in visible_texts)

        # Iterates self.websites; the `website` argument above is overwritten here.
        for website in self.websites:
            try:
                text = text_from_html(WebpageResolver.get_html(website))
                ld = LanguageDetection()
                langs = ld.return_data(text=text)
                #print(langs, website)
            except Exception:
                # Fetch or language detection failed for this website; try the next one.
                continue
            if 'pl' in langs and langs['pl'] > 0.25:
                return True
        return False
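# --- Sketch: the visible-text + language-detection idea in standalone form. ---
# The langdetect package stands in for the project's LanguageDetection module
# (an assumed substitute, not the original dependency).
import bs4
from bs4 import BeautifulSoup
from langdetect import detect_langs

SKIP_PARENTS = {'style', 'script', 'head', 'title', 'meta', '[document]'}

def visible_text(html: str) -> str:
    """Concatenate the human-visible text nodes of an HTML document."""
    soup = BeautifulSoup(html, 'html.parser')
    nodes = soup.find_all(string=True)
    keep = (t for t in nodes
            if t.parent.name not in SKIP_PARENTS
            and not isinstance(t, bs4.element.Comment))
    return " ".join(t.strip() for t in keep if t.strip())

sample = "<html><head><title>ignored</title></head><body><p>Dzień dobry, to jest polska strona.</p></body></html>"
text = visible_text(sample)
langs = {l.lang: l.prob for l in detect_langs(text)}
print(langs.get('pl', 0.0) > 0.25)  # same 25% threshold as check_if_polish_text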
Example #5
    def __init__(self, company_name):
        super().__init__(company_name)
        self.websites = WebpageResolver(company_name).return_data()['webpage']
        try:
            self.cache = pd.read_csv(PolandCheck.LOC + "cache.tsv",
                                     sep='\t',
                                     index_col='company')
        except FileNotFoundError:
            self.cache = pd.DataFrame(columns=['company', 'rank'])
            self.cache.set_index('company', inplace=True)
class Scamwatcher(DataSource):
    LOC = "modules/SCAMWATCHER/"
    PAGE_ROOT = "https://www.scamwatcher.org/{0}-review/"
    BINS = [5000, 30000, 70000]

    def __init__(self, company_name):
        super().__init__(company_name)
        self.company_name = WebpageResolver(company_name).company_name
        try:
            self.cache = pd.read_csv(
                Scamwatcher.LOC+"cache.tsv", sep='\t', index_col='company')
        except FileNotFoundError:
            self.cache = pd.DataFrame(columns=['company', 'rank'])
            self.cache.set_index('company', inplace=True)
            
    def return_data(self, **kwargs) -> dict:
        """ Key: Scamwatcher """
        if self.company_name in self.cache.index:
            data = self.cache.loc[self.company_name, 'rank']
            return {"Scamwatcher": str(data)}

        page = Scamwatcher.PAGE_ROOT.format(self.company_name).replace(" ", "-")
        res = requests.get(page)
        found = "Oops! That page" not in res.text
        if not found:
            page = Scamwatcher.PAGE_ROOT.format(' '.join(self.company_name.split()[:-1])).replace(" ", "-")
            res = requests.get(page)
            found = "Oops! That page" not in res.text

        if not found: 
            page = Scamwatcher.PAGE_ROOT.format(self.company_name.lower().replace("ltd", "limited")).replace(" ", "-")
            print(page)
            res = requests.get(page)
            found = "Oops! That page" not in res.text

        self.cache.loc[self.company_name, 'rank'] = found
        self.cache.to_csv(Scamwatcher.LOC+"cache.tsv", sep='\t')
        return {"Scamwatcher": str(bool(found))}
Example #7
    def __init__(self, company_name):
        super().__init__(company_name)
        try:
            self.cache = pd.read_csv(
                AlexaRank.LOC + "cache.tsv", sep='\t', index_col='company')
        except FileNotFoundError:
            self.cache = pd.DataFrame(columns=['company', 'rank'])
            self.cache.set_index('company', inplace=True)
        try:
            res = WebpageResolver(company_name).return_data()['webpage']
            self.webpages = res
        except IndexError as e:
            print("WEBPAGE NOT FOUND")
            raise e
def run_scrapper():
    df = pd.read_csv(MAIN_DATA,
                     sep='\t',
                     quotechar="\'",
                     error_bad_lines=False,  # on_bad_lines='skip' in pandas >= 1.3
                     quoting=csv.QUOTE_NONE)
    # The offset skips rows already processed by earlier runs and resumes from there.
    with tqdm(df['name'].iloc[667 + 753 + 4099:]) as t:
        for company_name in t:
            t.set_postfix(company_name=company_name)
            try:
                # Result is unused below; the lookup appears to exist to populate
                # WebpageResolver's cache as a side effect.
                res = WebpageResolver(company_name).return_data()['webpage']
            except (UnicodeError, requests.exceptions.InvalidURL,
                    requests.exceptions.MissingSchema, AttributeError,
                    requests.exceptions.ConnectionError):
                continue
Example #9
class BuiltWith(DataSource):
    def __init__(self, company_name):
        super().__init__(company_name)
        self.cache = Cache("modules/BUILTWITH/cache")
        self.resolv = WebpageResolver(company_name)
        self.company_name = self.resolv.company_name

    def return_data(self):
        temp_cache = self.cache.check_cache(self.company_name)
        if temp_cache is not None:
            return {"BuiltWith": temp_cache}

        out = []
        for link in self.resolv.return_data()['webpage']:
            try:
                res = builtwith.builtwith(link)
                if res not in out:
                    out.append(res)
            except Exception as e:
                print(e, "i co z tego")  # Polish: roughly "so what" (debug output)

        self.cache.append([self.company_name, out])
        return {"BuiltWith": out}
Example #10
    def __init__(self, company_name):
        super().__init__(company_name)
        resolv = WebpageResolver(company_name)
        self.company_name = resolv.company_name
        self.data_sources = {"Webpages": resolv.cache}
Example #11
    def __init__(self, company_name):
        super().__init__(company_name)
        self.cache = Cache("modules/BUILTWITH/cache")
        self.resolv = WebpageResolver(company_name)
        self.company_name = self.resolv.company_name
from .network import Network
from modules import WebpageResolver
import numpy as np

# WORKING EXAMPLES: Chinatsu and Partners
# Sample 10 random company names from WebpageResolver's cache of resolved pages.
cache = WebpageResolver('Mango').cache.index
to_search = np.random.choice(cache, size=10)
for i in to_search:
    print(i)
    module = Network(i)
    print(i, module.return_data())

print(module.find_company("bitcoin"))
Example #13
def get_WebpageResolver():
    res = WebpageResolver(request.args['name'].lower()).return_data()
    res = {i: str(j) for i, j in res.items()}
    return jsonify(res)
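# --- Sketch: the assumed Flask context around the view body above. ---
# The app object, route path, and registration are not shown in the snippet;
# everything outside the function body is a guess at the surrounding setup.
from flask import Flask, jsonify, request
from modules import WebpageResolver  # import path taken from the examples above

app = Flask(__name__)

@app.route("/get_WebpageResolver")  # hypothetical route; the real path is not shown
def get_WebpageResolver():
    # e.g. GET /get_WebpageResolver?name=Example%20Ltd
    res = WebpageResolver(request.args['name'].lower()).return_data()
    res = {i: str(j) for i, j in res.items()}
    return jsonify(res)

if __name__ == "__main__":
    app.run(debug=True)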