예제 #1
0
class LansWeeper(object):
    """Scrape asset links and per-asset details from a web console.

    Pages are fetched through a ``Browser`` instance and parsed with
    BeautifulSoup using the ``html5lib`` parser.  Every URL is built by
    appending a page name or a relative link to ``baselink``.
    """

    # Keys of the dictionary returned by getAssestsData, in output order.
    _FIELDS = (
        'Asset Type', 'Domain', 'OS', 'Build', 'Version', 'Manufacturer',
        'Model', 'SKU', 'Memory', 'Processor', 'Motherboard', 'Graphics',
        'Audio', 'Antivirus', 'Network', 'Harddisk',
    )
    # Fields whose value is the text of the link inside the value cell.
    # NOTE(review): "Last user" is not one of _FIELDS, so it can never match
    # in getAssestsData; it is kept only to mirror the original tuple.
    _LINK_FIELDS = ("Asset Type", "Last user", "Domain", "Manufacturer", "Model")
    # Fields whose value cell holds items as consecutive groups of three spans.
    _SPAN_TRIPLE_FIELDS = ("Antivirus", "Network")

    def __init__(self, baselink, username=None, password=None):
        """Open the login page and attempt to log in.

        Args:
            baselink: Base URL of the site; surrounding ``/`` are stripped and
                a single trailing ``/`` is appended.
            username: Account name.  When ``None`` the ``defaultuser`` input
                on the login page is clicked instead of filling the form.
            password: Password sent together with ``username``.

        The login step is best-effort: a failure is logged as a warning and
        construction still succeeds.
        """
        self.browser = Browser()
        self.baselink = str(baselink).strip("/") + "/"
        self.browser.open_page(self.baselink + 'login.aspx')
        try:
            if username is None:
                self.browser.sendClick("//input[@id='defaultuser']")
            else:
                self.__login(username, password)
        except Exception:
            # Keep construction best-effort, but leave a trace instead of
            # hiding the failure; KeyboardInterrupt/SystemExit now propagate.
            import logging
            logging.getLogger(__name__).warning(
                "Login step failed for %s", self.baselink, exc_info=True)

    def __login(self, username, password):
        """Fill the name and password inputs of the login form."""
        self.browser.sendData(username, "//input[@id='NameTextBox']")
        # NOTE(review): the third argument presumably submits the form --
        # confirm against Browser.sendData.
        self.browser.sendData(password, "//input[@id='PasswordInput']", True)

    @staticmethod
    def _clean(text):
        """Replace line breaks with spaces and strip surrounding spaces."""
        return text.replace("\n", " ").replace("\r", " ").strip(" ")

    def getAssestsLInks(self):
        """Return the ``href`` of the first link in each row of the asset list.

        The rows are the ``<tr>`` elements under the element with id
        ``appendbody`` on ``Assets.aspx``.  Rows without a link (for example
        header rows) are skipped.

        Returns:
            list: The ``href`` values, in page order.
        """
        page = BeautifulSoup(self.browser.getContent(self.baselink + "Assets.aspx"), "html5lib")
        links = []
        for row in page.find(id="appendbody").find_all("tr"):
            anchor = row.a
            href = anchor.get("href") if anchor is not None else None
            if href is not None:
                links.append(href)
        return links

    def getAssestsData(self, link):
        """Scrape the details table of one asset page.

        Args:
            link: Link of the asset page, relative to ``baselink``.

        Returns:
            dict: One entry per name in ``_FIELDS``; fields that do not appear
            on the page keep the empty string.  Multi-item fields
            (Antivirus, Network, Harddisk) are joined with ``;``.

        Raises:
            Exception: ``"Table not found"`` when the details table cannot be
                located or contains no cells.
        """
        myassest = dict.fromkeys(self._FIELDS, "")
        page = BeautifulSoup(self.browser.getContent(self.baselink + link), "html5lib")
        try:
            cells = (page.find(id="assetcontent")
                     .find("table", class_="compmenu")
                     .find_next_sibling("table")
                     .find_all("td"))
        except AttributeError as err:
            # A missing element makes one of the lookups return None.
            raise Exception("Table not found") from err
        if not cells:
            raise Exception("Table not found")

        for row in cells[0].find_all("tr"):
            td = row.find_all("td")
            # A usable row needs a label cell with content and a value cell.
            if len(td) < 2 or not td[0].contents:
                continue
            key = re.sub(' +', " ", str(td[0].contents[0].string).rstrip(":")).strip(" ")
            if key not in myassest:
                continue
            value_cell = td[1]
            if key in self._LINK_FIELDS:
                myassest[key] = self._clean(str(value_cell.a.string))
            elif key in self._SPAN_TRIPLE_FIELDS:
                myassest[key] = self._join_span_triples(value_cell)
            elif key == "Harddisk":
                myassest[key] = self._join_harddisks(value_cell)
            else:
                # Plain fields: the value is the node right after the icon.
                myassest[key] = re.sub(" +", " ", self._clean(str(value_cell.img.next_sibling)))
        return myassest

    def _join_span_triples(self, cell):
        """Join each complete group of three spans in *cell* into one entry."""
        spans = cell.find_all("span")
        # Ignore a trailing incomplete group, as the original item count did.
        usable = len(spans) - len(spans) % 3
        entries = [
            '{} {} {}'.format(*(self._clean(span.string) for span in spans[start:start + 3]))
            for start in range(0, usable, 3)
        ]
        return re.sub(' +', ' ', ';'.join(entries))

    def _join_harddisks(self, cell):
        """Build one entry per drive table that is a direct child of *cell*.

        Each entry is the text following the matching direct-child ``<img>``
        plus the span text of the second cell in the drive table's first row.
        Drives without a matching image are skipped.
        """
        drives = cell.find_all("table", recursive=False)
        labels = cell.find_all("img", recursive=False)
        entries = []
        for label, drive in zip(labels, drives):
            first_row_cells = drive.find_all("tr")[0].find_all("td", recursive=False)
            detail = str(first_row_cells[1].span.string).replace('\xa0', '')
            entries.append(self._clean(str(label.next_sibling)) + self._clean(detail))
        return re.sub(' +', ' ', ';'.join(entries))
예제 #2
0
class Books:
    """Scrape GoodReads book, list and edition ids and write them to files.

    Browsing is delegated to a ``Browser`` instance (``self.br``) and file
    output to a ``Writer`` instance (``self.wr``).  Scraped book ids are
    accumulated, without duplicates, in ``self._books_ids``.
    """

    # Line written in place of an id when nothing was found for an entry.
    _PLACEHOLDER = "-" * 7

    def __init__(self, path=None, arabic=True):
        """Create the browsing and writing managers.

        Args:
            path: Passed to ``Writer`` when truthy; otherwise ``Writer()`` is
                created with no arguments.
            arabic: When true, ``_scrape_list`` keeps every row; otherwise
                only rows whose rating count exceeds 1000 are kept.
        """
        self.arabic = arabic
        # Browsing and writing managers
        self.br = Browser()
        self.wr = Writer(path) if path else Writer()
        # An array for scraped books
        self._books_ids = []

    def append_books(self, books_ids):
        """Append external book ids to the local array, skipping duplicates."""
        for book_id in books_ids:
            # Only append id if it's not stored already
            if book_id not in self._books_ids:
                self._books_ids.append(book_id)

    def output_books(self, keyword=None, browse="list", file_name="books"):
        """Scrape books and write their ids to a file.

        Args:
            keyword: What to scrape; when falsy the already stored ids are
                written instead.
            browse: One of ``list``, ``lists``, ``author`` or ``shelf``.
            file_name: Name handed to the writer.
        """
        self.wr.open(file_name, "w+")
        try:
            # Get books if keyword is provided, otherwise output stored books
            books_ids = self.get_books(keyword, browse) if keyword else self._books_ids
            for book_id in books_ids:
                self.wr.write(book_id)
        finally:
            # Close the writer even when scraping fails part-way.
            self.wr.close()

    def output_books_editions(self, books_ids=None, file_name="editions"):
        """Append the editions id of each book to a file, resuming a prior run.

        As many leading ids as ``read_books(file_name)`` returns entries are
        skipped.

        Args:
            books_ids: Book ids to process; the stored ids are used when this
                is ``None`` or empty.
            file_name: Name handed to ``read_books`` and to the writer.

        Returns:
            ``True`` when every pending book was processed, ``None`` when an
            editions id could not be obtained (processing stops there).
        """
        skip = len(read_books(file_name))
        # Choose the source list first, then skip what is already done; the
        # default books_ids=None must not be sliced.
        pending = (books_ids or self._books_ids)[skip:]
        self.wr.open(file_name, "a+")
        try:
            for book_id in pending:
                editions_id = self.get_book_editions_id(book_id)
                # Editions id is None when page refuses to load
                if editions_id is None:
                    return None
                # Write editions id to file if it loads correctly
                self.wr.write(editions_id or self._PLACEHOLDER)
                # Display book id and editions id
                print(f"Book ID:\t{book_id:<15}Book Editions ID:\t{editions_id or ''}")
        finally:
            self.wr.close()
        return True

    def output_books_edition_by_language(self, editions_ids, lang="Arabic", file_name="ara_books"):
        """Write, per editions id, the ids of its editions in ``lang``.

        Resumes like ``output_books_editions``.  After all entries are
        written, every id found in ``file_name`` is rewritten one per line
        into ``file_name + "_list"``.

        Args:
            editions_ids: Editions ids to process; non-numeric entries are
                recorded with the placeholder without opening a page.
            lang: Language an edition must have to be kept.
            file_name: Name handed to ``read_books`` and to the writer.

        Returns:
            ``True`` on completion, ``None`` when an editions page could not
            be scraped (processing stops there).
        """
        skip = len(read_books(file_name))
        self.wr.open(file_name, "a+")
        try:
            for editions_id in editions_ids[skip:]:
                books_ids = self.get_book_edition_by_language(editions_id, lang) if editions_id.isdigit() else ''
                # Result is None when page refuses to load
                if books_ids is None:
                    return None
                # Write books ids to file if the page loads correctly
                self.wr.write(books_ids or self._PLACEHOLDER)
                # Display editions id and books ids
                print(f"Book Editions ID:\t{editions_id:<15}Books IDs:\t{books_ids or ''}")
        finally:
            self.wr.close()
        # Open a new file to move done list to it
        self.wr.open(file_name + "_list")
        try:
            for line in read_books(file_name):
                # Skip placeholder lines
                if line != self._PLACEHOLDER:
                    # Write each book edition id in a separate line
                    for id_ in line.split(','):
                        self.wr.write(id_)
        finally:
            self.wr.close()
        return True

    def get_books(self, keyword, browse="list"):
        """Scrape book ids for one or more keywords into the local array.

        Args:
            keyword: A single keyword or a list of keywords.  With
                ``browse="lists"`` it is a search phrase whose resulting
                lists are all scraped.
            browse: ``lists`` to search for lists first, otherwise the page
                type passed to ``Browser.open_page``.

        Returns:
            list: ``self._books_ids``, including whatever was collected
            before an error or a keyboard interrupt stopped the scrape.
        """
        # Get lists in search list if searching
        if browse == "lists":
            keywords = self._get_lists(keyword.replace(' ', '+'))
            browse = "list"
        # Otherwise, it's a single "list" or "shelf"
        else:
            keywords = [
                str(key) for key in (
                    keyword if isinstance(keyword, list) else [keyword]
                )]
        try:
            for key in keywords:
                # Open each list url
                self.br.open_page(key, browse)
                # Scrape pages until there's no next page
                while True:
                    self._scrape_list("book", self._books_ids)
                    if not self.br.goto_next_page():
                        break
        except KeyboardInterrupt:
            # The former ``return`` inside ``finally`` swallowed this
            # implicitly; keep the partial result, but say so explicitly.
            print("Scraping interrupted; returning books collected so far")
        except Exception as e:
            print("Couldn't go to next page:", e)
        return self._books_ids

    def get_book_editions_id(self, book_id):
        """Open the page of ``book_id`` and return the browser's editions id."""
        self.br.open("/book/show/", book_id)
        return self.br.editions_id()

    def get_book_edition_by_language(self, editions_id, lang):
        """Return the ids of editions in ``lang`` rated by more than 50 users.

        Returns:
            A comma-joined string of ids (possibly empty), or ``None`` when
            the page has no ``workEditions`` element.
        """
        self.br.open_book_editions(editions_id)
        soup = BeautifulSoup(self.br.page_source, "lxml").find(class_="workEditions")
        if not soup:
            return None
        editions = []
        for details in soup.find_all(class_="editionData"):
            # The third- and second-to-last data rows hold language and rating.
            language, rating = [row.find(class_="dataValue") for row in details.find_all(class_="dataRow")[-3:-1]]
            if language.text.strip() != lang:
                continue
            reviewers = get_digits(rating.find("span").text)
            if reviewers > 50:
                editions.append(id_from_url.match(details.find(class_="bookTitle")["href"]).group(1))
        return ','.join(editions)

    def _get_lists(self, keyword):
        """Return the ids of all lists found by searching for ``keyword``."""
        lists = []
        # Open GoodReads' lists search url
        self.br.open_list_search(keyword)
        # Scrape all result pages
        while True:
            self._scrape_list("list", lists)
            # Go to next page if there's one, otherwise break
            if not self.br.goto_next_page():
                break
        return lists

    def _scrape_list(self, title, array):
        """Scrape the current results page and append new ids to ``array``.

        Args:
            title: ``"book"`` or ``"list"``; selects the ``<title>Title``
                link class and is used in the progress message.
            array: List extended in place with ids not already present.

        Returns:
            ``None`` in every case; nothing is appended when the page has no
            ``tableList`` element.
        """
        soup = BeautifulSoup(self.br.page_source, "lxml").find(class_="tableList")
        if not soup:
            return None
        for book in soup.find_all("tr"):
            if self.arabic or get_digits(book.find(class_="minirating").text.split("—")[1]) > 1000:
                try:  # Get id from url
                    id_ = id_from_url.match(book.find(class_=title + "Title")["href"]).group(1)
                except Exception:
                    print("Couldn't extract Book Id from URL")
                    continue
                # Store the id only if it is new
                if id_ not in array:
                    array.append(id_)
                    print(f"{title.capitalize()} {id_:<10}count:\t{len(array)}")