Example #1
def find_organization(self):
    # Search the home page for a link back to the site root.
    soup = BeautifulSoup(urlopen(self.link).read(), 'html.parser')
    home = soup.find('a', {'href': '/index.php'})
    if home is None:
        home = soup.find('a', text=re.compile('[Hh]ome'))
    if home is None or not home.has_attr('href'):
        self.org = Organization("No organization found")
    else:
        home_url = urljoin(self.get_base(), home['href'])
        homepage = BeautifulSoup(urlopen(home_url).read(), 'html.parser')
        title = homepage.find('title')
        # Guard against a home page with no <title> tag.
        org = title.text if title is not None else "No organization found"
        self.org = Organization(org)
        self.org.validate_in_viaf()
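The snippet assumes an Organization class with a VIAF lookup. A minimal stub like the one below (hypothetical, for illustration only) is enough to exercise the method:

class Organization:
    # Hypothetical stand-in for the project's real Organization class.
    def __init__(self, name):
        self.name = name
        self.validated = False

    def validate_in_viaf(self):
        # The real method presumably checks the name against the VIAF
        # authority file; this stub just leaves the flag unset.
        self.validated = False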
Example #2
def get_organization_by_id(self, id):
    try:
        with DB.cursor() as cursor:
            # Read a single record, using a parameterized query so the
            # id argument cannot inject SQL. No commit is needed for a
            # read-only SELECT.
            query = "SELECT * FROM organizations WHERE id = %s"
            cursor.execute(query, (id,))
            result = cursor.fetchone()
            return Organization(result)
    except Exception as e:
        raise InvalidUsage("Failed to find org with id " + id) from e
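The %s placeholder above assumes a DB-API driver using the format paramstyle, such as PyMySQL; a hypothetical connection setup might look like this (swap the placeholder to ? for sqlite3):

import pymysql

# Hypothetical module-level handle; the example only requires that
# DB.cursor() work as a context manager.
DB = pymysql.connect(host='localhost', user='user',
                     password='secret', database='clubs')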
Example #3
def load_organizations(self):
    with open("../clubs.json", "r") as myfile:
        data = myfile.read()
    orgs = json.loads(data)

    organizations = []

    for org in orgs:
        organizations.append(
            Organization(org["name"], org["desc"], org["link"],
                         org["img"]))

    return organizations
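load_organizations reads four keys from each record, so a clubs.json shaped like the sample below is assumed; the values here are purely illustrative:

import json

# Illustrative ../clubs.json contents; only the four keys that
# load_organizations reads actually matter.
sample = [{
    "name": "Example Club",
    "desc": "A short description",
    "link": "http://www.example.org",
    "img": "http://www.example.org/logo.png",
}]
with open("../clubs.json", "w") as f:
    json.dump(sample, f, indent=4)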
Example #4
import re
import tldextract
from bs4 import BeautifulSoup
from urllib.error import URLError
from urllib.parse import urljoin
from urllib.request import urlopen
# check_link, check_type, visible, generic_terms, disciplinesKnown,
# resourceTypesKnown and Organization come from elsewhere in this project.

class Resource:
    def __init__(self, url):
        link_status = check_link(url)
        if link_status == "working":
            self.link = url
            self.status = link_status
        else:
            print("Error with url.")
            self.status = link_status
            print(self.status)
            print("Please check your link (perhaps use http://www...) and try again")
            # raise SystemExit rather than the interactive-only exit() helper
            raise SystemExit
        # One list per instance; a class-level list would be shared
        # across every Resource object.
        self.links_found = []

    text = ""
    title = "No title"
    status = "No status"
    resource_types = ""
    themes = ""
    org = "No organization found"
    validated = False
    resource_contact_person_name = "No contact individual found"
    resource_contact_org = "No contact org"
    resource_contact_email = "No contact email"
    resource_contact_phone = "No contact phone"

    def get_org(self):
        return self.org.name

    def get_resource_data(self):
        self.build_title()
        self.find_resource_types()
        self.find_themes()
        self.find_organization()
        self.find_contact_info()

    def build_title(self):
        page_text = BeautifulSoup(urlopen(self.link).read(), 'html.parser')
        title = page_text.find('title', text=True)
        if title is not None:
            # has_attr() checks HTML attributes, not tag properties, so the
            # original 'string' branch never ran; title.text covers both cases.
            no_generics = re.sub(generic_terms, '', title.text)
            self.title = re.sub('[^a-zA-Z0-9 -]', '', no_generics)
        else:
            self.title = 'No title'

    def find_links(self):
        if self.status == "working":
            try:
                soup = BeautifulSoup(urlopen(self.link, timeout=7).read(), 'html.parser')
                for link_tag in soup.find_all('a', href=True):
                    if check_link(link_tag['href']) != "working":
                        # Likely a relative link: resolve it against this page.
                        new_url = urljoin(self.link, link_tag['href'])
                        if check_link(new_url) == "working" and new_url != self.link:
                            if new_url not in self.links_found:
                                self.links_found.append(new_url)
                    else:
                        if link_tag['href'] != self.link:
                            if link_tag['href'] not in self.links_found:
                                self.links_found.append(link_tag['href'])
            except URLError as e:
                # The original format string had three placeholders for only
                # two arguments, which raised IndexError instead of reporting.
                self.status = "{} {}".format(self.link, e.reason)

    def get_base(self):
        ext_url = tldextract.extract(self.link)
        # Skip empty parts so a URL without a subdomain does not
        # produce "http://.domain.suffix".
        parts = [p for p in (ext_url.subdomain, ext_url.domain, ext_url.suffix) if p]
        return "http://" + ".".join(parts)

    def find_contact_info(self):
        page = BeautifulSoup(urlopen(self.link).read(), 'html.parser')
        contact = page.find('a', text=re.compile('[Cc]ontact.*'))
        if contact is not None and contact.has_attr('href'):
            if check_link(contact['href']) == "working":
                page = BeautifulSoup(urlopen(contact['href']).read(), 'html.parser')
            else:
                link = urljoin(self.get_base(), contact['href'])
                if check_link(link) == "working":
                    page = BeautifulSoup(urlopen(link).read(), 'html.parser')
        # First look for a tag with class="phone" ...
        phone = page.find(class_='phone')
        if phone is not None:
            self.resource_contact_phone = phone.text
        else:
            # ... otherwise fall back to matching an international number.
            phone = page.find(text=re.compile(r'\+(9[976]\d|8[987530]\d|6[987]\d'
                                              r'|5[90]\d|42\d|3[875]\d|2[98654321]'
                                              r'\d|9[8543210]|8[6421]|6[6543210]|5'
                                              r'[87654321]|4[987654310]|3[9643210]'
                                              r'|2[70]|7|1)\s*(\(\d+\)|\d+)(\s|-)[0-9]+(-*)[0-9]+'))
            if phone is not None:
                self.resource_contact_phone = phone.strip()
        email = page.find(class_='email')
        if email is not None:
            self.resource_contact_email = email.text
        else:
            email = page.find(text=re.compile(r'[A-Za-z0-9-._]+(@|\(at\)| at )+[A-Za-z0-9-._]+\.[A-Za-z0-9-._]+'))
            if email is not None:
                self.resource_contact_email = email

    def find_organization(self):
        # Search the home page for a link back to the site root.
        soup = BeautifulSoup(urlopen(self.link).read(), 'html.parser')
        home = soup.find('a', {'href': '/index.php'})
        if home is None:
            home = soup.find('a', text=re.compile('[Hh]ome'))
        if home is None or not home.has_attr('href'):
            self.org = Organization("No organization found")
        else:
            home_url = urljoin(self.get_base(), home['href'])
            homepage = BeautifulSoup(urlopen(home_url).read(), 'html.parser')
            title = homepage.find('title')
            # Guard against a home page with no <title> tag.
            org = title.text if title is not None else "No organization found"
            self.org = Organization(org)
            self.org.validate_in_viaf()

    def find_themes(self):
        disciplines_found = []
        set_of_disciplines = set()
        if check_type(self.link) == "FTP":
            return "None"
        if check_type(self.link) == "HTTP":
            souper = BeautifulSoup(urlopen(self.link).read(), 'html.parser')
            # Search for all keywords, the values of the disciplines dict
            for key in disciplinesKnown:
                for v in disciplinesKnown.get(key):
                    # For all keywords found, keep only the matches that
                    # appear in the page's visible text
                    texts = souper.find_all(text=re.compile(v))
                    visible_texts = filter(visible, texts)
                    # For every keyword (value) found, add the discipline from
                    # our dictionary (key) to the set of disciplines associated
                    # with the resource (a set avoids duplicates)
                    for vis in visible_texts:
                        set_of_disciplines.add(key)
            # Turn the set of disciplines back into a list
            disciplines_found = list(set_of_disciplines)
        if len(disciplines_found) > 0:
            return " - ".join(disciplines_found)
        else:
            return 'No disciplines found'

    def find_home_page(self):
        # Rebuild a root URL from the registered domain and suffix.
        ext = tldextract.extract(self.link)
        return "http://www." + ext.domain + "." + ext.suffix

    def find_resource_types(self):
        set_of_resources = set()
        souper2 = BeautifulSoup(urlopen(self.link).read(), 'html.parser')
        for key in resourceTypesKnown:
            for v in resourceTypesKnown.get(key):
                # Keep only keyword matches in the page's visible text.
                texts = souper2.find_all(text=re.compile(v))
                visible_texts = filter(visible, texts)
                for vis in visible_texts:
                    set_of_resources.add(key)
        resos_found = list(set_of_resources)
        if len(resos_found) > 0:
            self.resource_types = " - ".join(resos_found)
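
A minimal driver sketch, assuming check_link reports "working" for a reachable URL and that the helpers the class depends on are in scope; the URL is a placeholder:

# Hypothetical usage of the Resource class above.
resource = Resource("http://www.example.org")
resource.get_resource_data()
print(resource.title)
print(resource.get_org())
print(resource.resource_contact_email)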