def __init__(self):
     """Set up loggers, HTTP/parsing helpers and the database handle.

     NOTE(review): this is a method fragment — the enclosing class is
     not visible in this chunk; `self.DOMAIN` is presumably a class
     attribute defined there (verify against the full file).
     """
     print self.DOMAIN
     self.logger = Logger(name='dubizzle_data_log')
     self.err_logger = Logger(name='err_dubizzle_data_log')
     self.request_manager = RequestManager()
     self.source_code_manager = SourceCodeManager()
     self.generator = Generator()
     self.db = DatabaseManager()
Beispiel #2
0
class DataExtractor:
    DOMAIN = 'dubicars.com'
    PROJECT_ID = 13

    PATH = 'phones/'

    def __init__(self):
        print self.DOMAIN
        self.logger = Logger(name='dubicars_data_log')
        self.err_logger = Logger(name='err_dubicars_data_log')
        self.request_manager = RequestManager()
        self.source_code_manager = SourceCodeManager()
        self.generator = Generator()
        self.db = DatabaseManager()
        self.trim_list = self.db.get_trim_list()

    def extract_data(self, url_data):
        print url_data
        url_id = url_data['id']
        url = url_data['url']
        listing_id = url_data['listing_id']

        data = {}

        response = self.request_manager.take_get_request(url)
        source_code = response['source_code']
        parsed_code = self.source_code_manager.parse_code(source_code)

        expired = parsed_code.find('img', {'class': 'sold'})
        if expired is not None:
            self.db.set_url_inactive(url_id)
            self.err_logger.error("EXPIRED " + str(url_data))

            return
        elif response['status_code'] == 404:
            self.db.set_url_inactive(url_id)
            self.err_logger.error("404 " + str(url_data))

            return

        try:
            marka = self.__find_make(parsed_code)

            year = self.__find_year(parsed_code)
            kilometres = self.__find_km(parsed_code)
            color = self.__find_color(parsed_code)
            specs = self.__find_specs(parsed_code)
            price = self.__find_price(parsed_code)
            model = self.__find_model(parsed_code, make=marka)
            trim = self.__find_trim(parsed_code, marka=marka, model=model)
            if trim == 'Other':
                self.db.set_url_processed(url_id)
                self.db.set_url_inactive(url_id)
                return
            phone = self.__find_phone(parsed_code)
        except Exception as exc:
            self.err_logger.error(str(exc) + str(url_data))

            self.db.set_url_processed(url_id)
            return
        try:
            data['year'] = int(year)
            data['price'] = int(price)
            data['kilometres'] = int(kilometres)
            data['color'] = color
            data['specs'] = specs
            data['trim'] = trim
            data['model'] = model
            data['make'] = marka
            data['phone'] = phone
            print data
        except Exception as exc:
            self.err_logger.error(str(exc) + url_data)

            self.db.set_url_processed(url_id)
            self.db.set_url_inactive(url_id)

            return

        self.db.insert_data(data=data,
                            listing_id=listing_id,
                            url=url,
                            source=self.DOMAIN)
        self.db.set_url_processed(url_id)

    def update_data(self, url_data):
        timestamp = generate_timestamp()
        url_id = url_data['id']
        listing_id = url_data['listing_id']
        print listing_id
        url = url_data['url']
        first_timestamp = url_data['timestamp']
        time_dif = first_timestamp - datetime.strptime(timestamp,
                                                       "%Y.%m.%d:%H:%M:%S")
        time_dif = time_dif.days

        response = self.request_manager.take_get_request(url)
        source_code = response['source_code']
        parsed_code = self.source_code_manager.parse_code(source_code)
        expired = parsed_code.find('img', {'class': 'sold'})
        if expired is not None:
            self.db.set_sold_status(listing_id=listing_id,
                                    days_for_selling=time_dif)
            #self.db.remove_listing(listing_id)
            self.db.set_url_inactive(url_id)
            return

        elif response['status_code'] == 404:
            print 404, listing_id
            self.db.set_sold_status(listing_id=listing_id,
                                    days_for_selling=time_dif)
            #self.db.remove_listing(listing_id)
            self.db.set_url_inactive(url_id)
            return

        try:
            price = self.__find_price(parsed_code)
        except:
            price = 0

        # days = self.__calc_days_on_market(listing_id)

        self.db.update_listing(listing_id=listing_id,
                               price=int(price),
                               days_on_market=time_dif)
        self.db.set_updated(listing_id=listing_id)

    def __find_make(self, code):
        try:
            make = self.__find_tag_by_text(code, text='Make:')
            return make
        except:
            return ''

    def __find_year(self, code):
        try:
            year = self.__find_tag_by_text(code, text='Year:')
            year_list = year.split()
            for year in year_list:
                try:
                    year = int(year)
                    return year
                except:
                    continue
        except:
            return ''

    def __find_km(self, code):
        try:
            km = self.__find_tag_by_text(code, text='Kilometers:')
            km = km.replace(",", "").replace(".", "").replace(" ", "")
            return int(km)
        except:
            return 0

    def __find_color(self, code):
        try:
            color = self.__find_tag_by_text(code, text='Color:')
            return color.strip()
        except:
            return ''

    def __find_specs(self, code):
        try:
            specs = self.__find_tag_by_text(code, text='Specs:')
            return specs.strip()
        except:
            return ''

    # ============= TRIM ===============
    # =====
    def __generateEditedTrims(self, marka, trim):
        for example_trim in self.trim_list:
            try:
                if len(example_trim['trim']) <= 3:
                    continue
            except:
                continue

            if '-' in example_trim['trim']:
                if example_trim['make'] == marka:

                    edited_example_trim = example_trim['trim'].replace(
                        '-', ' ')
                    if edited_example_trim in trim:
                        print example_trim['trim']
                        return example_trim['trim']

                    edited_example_trim = example_trim['trim'].replace(
                        '-', ' ').title()
                    if edited_example_trim in trim:
                        print example_trim['trim']
                        return example_trim['trim']
        return ''

    def __find_trim(self, code, marka, model):
        try:
            to_return_trim = ''
            not_edited_trim = self.__find_tag_by_text(code,
                                                      text='Model:').strip()
            trim = not_edited_trim.replace(model, '').strip()

            if len(trim.split()) == 0:
                print not_edited_trim, 'there is no Trim!!!!'
                return not_edited_trim.strip()

            for example_trim in self.trim_list:
                if example_trim['make'] == marka:
                    if example_trim['trim'] in trim:

                        if len(example_trim['trim']) <= 2:
                            if ' ' + example_trim[
                                    'trim'] + ' ' in ' ' + trim + ' ':
                                if len(example_trim['trim']) > len(
                                        to_return_trim):
                                    print example_trim['trim']
                                    to_return_trim = example_trim['trim']
                            continue

                        if len(example_trim['trim']) > len(to_return_trim):
                            print example_trim['trim']
                            to_return_trim = example_trim['trim']

            edited_trim = self.__generateEditedTrims(marka=marka, trim=trim)
            if len(edited_trim) > len(to_return_trim):
                return edited_trim
            elif to_return_trim == '':
                if len(trim.split()) <= 2 and len(trim.split()) > 0:
                    return trim
            else:
                return to_return_trim
        except:
            return ''

    # =====
    # ============= TRIM ===============

    def __find_model(self, code, make):
        try:
            breadcrumbs = code.findAll('span', {'typeof': 'v:Breadcrumb'})
            name = breadcrumbs[-1].text
            len_make = len(make.split())
            trim = name.split()[len_make:]
            trim = ' '.join(trim)
            return trim.strip()
        except Exception as exc:
            print exc
            return ''

    def __find_phone(self, code):
        try:
            phone = code.find('p', {
                'id': 'contact-buttons'
            }).find('a')['data-reveal']
            phone = phone.replace('"',
                                  "").replace(" ",
                                              "").replace("[",
                                                          "").replace("]", "")
            return phone.strip()
        except Exception as exc:
            print exc
            return ''

    def __find_price(self, code):
        try:
            price = code.find('strong', {'class': 'money'}).text
            price = price.replace('AED', "").replace(" ", "").\
                replace(",", "").\
                replace(".", "").\
                replace("-", "")
            return int(price)
        except:
            try:
                price = code.find('strong', {'class': 'money reduced'}).text
                price = price.replace('AED', "").replace(" ", ""). \
                    replace(",", ""). \
                    replace(".", ""). \
                    replace("-", "")
                return int(price)
            except:
                return 0

    def __find_tag_by_text(self, code, text):

        tag_with_text = code.find(text=text)
        needed_tag = tag_with_text.parent.find_next_sibling()
        return needed_tag.text
class DataExtractor:
    DOMAIN = 'dubai.dubizzle.com'
    PROJECT_ID = 13

    PATH = 'phones/'

    def __init__(self):
        print self.DOMAIN
        self.logger = Logger(name='dubizzle_data_log')
        self.err_logger = Logger(name='err_dubizzle_data_log')
        self.request_manager = RequestManager()
        self.source_code_manager = SourceCodeManager()
        self.generator = Generator()
        self.db = DatabaseManager()

    def extract_data(self, url_data):
        print url_data
        url_id = url_data['id']
        url = url_data['url']
        listing_id = url_data['listing_id']

        data = {}

        response = self.request_manager.take_get_request(url)
        source_code = response['source_code']
        parsed_code = self.source_code_manager.parse_code(source_code)

        expired = parsed_code.find('div', {'id': 'expired-ad-message'})
        if expired is not None:
            #self.db.remove_listing(listing_id)
            self.db.set_url_inactive(url_id)
            self.err_logger.error("EXPIRED " + str(url_data))

            return
        elif response['status_code'] == 404:
            #self.db.remove_listing(listing_id)
            self.err_logger.error("404 " + str(url_data))

            self.db.set_url_inactive(url_id)

            return

        bread = parsed_code.find('span', {'id': 'browse_in_breadcrumb'})
        items = bread.findAll('div')
        try:
            year = parsed_code.find('img', attrs={
                'alt': 'Year'
            }).parent.text.replace('Year', '').strip()

            kilometres = parsed_code.find('img', attrs={
                'alt': 'Kilometers'
            }).parent.text.replace('Kilometers',
                                   '').strip().replace(',',
                                                       '').replace('.', '')
            color = parsed_code.find('img', attrs={
                'alt': 'Color'
            }).parent.text.replace('Color', '').strip()
            specs = parsed_code.find('img', attrs={
                'alt': 'Specs'
            }).parent.text.replace('Specs', '').strip()
            trim = parsed_code.find('img', attrs={
                'alt': 'Trim'
            }).parent.parent.text.replace('Trim', '').strip()
            if trim == 'Other':
                self.db.set_url_processed(url_id)
                return
            price = parsed_code.find('span', {
                'id': 'actualprice'
            }).text.replace(',', '').replace('.', '')
            model = items[-1].find('a').text.strip()
            marka = items[-2].find('a').text.strip()
            phone = self.extract_phone(parsed_code, id=url_id)
        except Exception as exc:
            self.err_logger.error(str(exc) + str(url_data))
            self.db.set_url_processed(url_id)

            return

        data['year'] = int(year)
        data['price'] = int(price)
        data['kilometres'] = int(kilometres)
        data['color'] = color
        data['specs'] = specs
        data['trim'] = trim
        data['model'] = model
        data['make'] = marka
        data['phone'] = phone

        self.db.insert_data(data=data,
                            listing_id=listing_id,
                            url=url,
                            source=self.DOMAIN)
        self.db.set_url_processed(url_id)

    def update_data(self, url_data):
        timestamp = generate_timestamp()
        url_id = url_data['id']
        listing_id = url_data['listing_id']
        print listing_id
        url = url_data['url']
        first_timestamp = url_data['timestamp']
        time_dif = first_timestamp - datetime.strptime(timestamp,
                                                       "%Y.%m.%d:%H:%M:%S")
        time_dif = time_dif.days

        response = self.request_manager.take_get_request(url)
        source_code = response['source_code']
        parsed_code = self.source_code_manager.parse_code(source_code)
        expired = parsed_code.find('div', {'id': 'expired-ad-message'})
        if expired is not None:
            self.db.set_sold_status(listing_id=listing_id,
                                    days_for_selling=time_dif)
            #self.db.remove_listing(listing_id)
            self.db.set_url_inactive(url_id)
            print "updated"

            return

        elif response['status_code'] == 404:
            print 404, listing_id
            self.db.set_sold_status(listing_id=listing_id,
                                    days_for_selling=time_dif)
            #self.db.remove_listing(listing_id)
            self.db.set_url_inactive(url_id)
            print "updated"

            return

        try:
            price = parsed_code.find('span', {
                'id': 'actualprice'
            }).text.replace(',', '').replace('.', '')
        except:
            price = 0

        # days = self.__calc_days_on_market(listing_id)

        self.db.update_listing(listing_id=listing_id,
                               price=int(price),
                               days_on_market=time_dif)
        self.db.set_updated(listing_id=listing_id)
        print "updated"

    # def __calc_days_on_market(self, listing_id):
    #     days_on_market = self.db.get_car_data(listing_id).days_on_market
    #     if days_on_market is None:
    #         return 0
    #     days_on_market += 1
    #     return days_on_market

    def extract_phone(self, code, id):
        img = code.find('img', {'class': 'phone-num-img'})['src']

        ext = img.partition('data:image/')[2].split(';')[0]
        with open(self.PATH + str(id) + '.' + ext, 'wb') as f:
            f.write(ba.a2b_base64(img.partition('base64,')[2]))

        text = textract.process(self.PATH + str(id) + '.' + ext).replace(
            ' ', '')

        if '+971' in text:
            pass
        else:
            text = '+971' + text

        os.remove(self.PATH + str(id) + '.' + ext)
        return text.strip()
Beispiel #4
0
 def __init__(self):
     """Create the scraper's HTTP helper.

     NOTE(review): method fragment — the enclosing class is not
     visible in this chunk.
     """
     self._request_manager = RequestManager()
Beispiel #5
0
class CommentScraper:
    """Scrapes full comment trees from old.reddit.com thread pages."""

    def __init__(self):
        self._request_manager = RequestManager()

    def scrape_comments(self, url, sort_by):
        """Return a list of comment dicts for the thread at *url*.

        The www subdomain is rewritten to old.reddit.com because the
        parsing below targets the old markup.
        """
        if url.startswith("https://www"):
            url = url.replace("www", "old", 1)

        soup = self._request_manager.get_reddit_soup(url)
        return self._parse_comments_from_document(soup)

    def _parse_comments_from_document(self, document, get_children=False):
        """Parse all top-level comments (plus 'morechildren' stubs) out
        of a thread document.

        get_children: when True, the document is a single comment's
        permalink page, so the first comment is the parent itself and
        is skipped.
        """
        comment_objects_list = []

        try:
            container = document.find_all("div", class_=["nestedlisting"])[0]
        except IndexError:
            return comment_objects_list

        container_comments = container.find_all("div", class_="comment")
        # A reply page with exactly one comment has no actual replies.
        if (get_children and len(container_comments) == 1):
            return comment_objects_list

        first_comment = (container.find_all(
            "div", class_="comment")[1 if get_children else 0])

        comment_objects_list.append(self._extract_comment_data(first_comment))

        for sibling in first_comment.next_siblings:
            # BUG FIX: next_siblings yields NavigableStrings as well as
            # Tags; subscripting a string with ["class"] raised
            # TypeError (and Tags without a class attribute raised
            # KeyError). Check the type first and use .get().
            if not isinstance(sibling, Tag):
                continue
            sibling_classes = sibling.get("class", [])

            if "comment" in sibling_classes:
                comment_objects_list.append(
                    self._extract_comment_data(sibling))
            elif "morechildren" in sibling_classes:
                subreddit = (document.find(
                    "link", {"rel": "canonical"})["href"].split("/")[4])

                comment_objects_list.extend(
                    self._get_more_comments(sibling, subreddit))

        return comment_objects_list

    def _extract_comment_data(self, comment_tag, recursive=True):
        """Build the comment dict for one comment tag; when *recursive*
        and the comment has children, fetch and attach its replies."""
        top_level_comment_object = {}

        # Score/author can be missing on removed or deleted comments.
        score_tag = comment_tag.find("span", class_="score unvoted")
        score = score_tag["title"] if score_tag is not None else "???"
        author_tag = comment_tag.find("a", class_="author")
        author = (author_tag.text.strip()
                  if author_tag is not None else "[deleted]")

        date_posted = comment_tag.find("time", class_="live-timestamp")
        date_posted_timestamp = date_posted["datetime"]
        date_posted_readable = date_posted["title"]

        date_edited = comment_tag.find("time", class_="edited-timestamp")
        date_edited_timestamp = (date_edited["datetime"]
                                 if date_edited is not None else None)

        # "(N children)" -> N
        num_children = int(
            comment_tag.find("a", class_="numchildren").text.strip().replace(
                "(", "").replace(")", "").split(" ")[0])

        permalink_old = comment_tag.find("a", class_="bylink")["href"]
        permalink = permalink_old.replace("old", "www", 1)

        comment_container = comment_tag.find(
            "div", class_="usertext-body may-blank-within md-container").find(
                "div", class_="md")

        comment_formatted = comment_container.prettify()
        comment_raw = " ".join([
            p.text for p in comment_container.find_all("p")
        ]).strip().rstrip()

        top_level_comment_object["score"] = score
        top_level_comment_object["author"] = author
        top_level_comment_object["date_posted_timestamp"] = (
            date_posted_timestamp)
        top_level_comment_object["date_posted_readable"] = date_posted_readable
        top_level_comment_object["date_edited_timestamp"] = (
            date_edited_timestamp)
        top_level_comment_object["num_children"] = num_children
        top_level_comment_object["permalink_old"] = permalink_old
        top_level_comment_object["permalink"] = permalink
        top_level_comment_object["comment_formatted"] = comment_formatted
        top_level_comment_object["comment_raw"] = comment_raw

        if num_children == 0 or not recursive:
            return top_level_comment_object
        else:
            # Fetch the comment's own permalink page to get its replies.
            nested_soup = self._request_manager.get_reddit_soup(permalink_old)
            parsed_replies = self._parse_comments_from_document(
                nested_soup, True)

            if len(parsed_replies) == 0:
                return top_level_comment_object

            top_level_comment_object["replies"] = parsed_replies

            return top_level_comment_object

    def _get_more_comments(self, morecomment_tag, subreddit):
        """Resolve a 'load more comments' stub through the old-reddit
        morechildren API and return the extracted comment dicts."""
        # The onclick handler carries the morechildren(...) call args.
        morecomments_args = (morecomment_tag.a["onclick"].replace(
            "return morechildren",
            "").replace("(", "").replace(")", "").replace("'", "").split(","))

        data_id = morecomment_tag["data-fullname"]
        link_id = morecomments_args[1].strip()
        sort = morecomments_args[2].strip()
        renderstyle = "html"
        limit_children = False
        r = subreddit
        children = (",".join(morecomments_args[3:len(morecomments_args) -
                                               1]).strip())

        payload = {
            "id": data_id,
            "link_id": link_id,
            "sort": sort,
            "renderstyle": renderstyle,
            "limit_children": limit_children,
            "r": r,
            "children": children
        }

        more_soup = self._request_manager.post_reddit_soup(
            "https://old.reddit.com/api/morechildren", payload)

        # The API answers with jquery-call JSON; index [10][3][0] is the
        # list of rendered comment payloads.
        json_comments = json.loads(more_soup.prettify())
        json_comments_list = json_comments["jquery"][10][3][0]

        more_comments = []
        for comment in json_comments_list:
            comment_content = comment["data"]["content"]
            comment_tag_string = html.unescape(comment_content)
            comment_tag_soup = BeautifulSoup(comment_tag_string, "html.parser")
            if comment["kind"] == "more":
                # Nested stub: recurse to resolve it too.
                more_comments.extend(
                    self._get_more_comments(
                        comment_tag_soup.find("div", class_="morechildren"),
                        subreddit))
            else:
                more_comments.append(
                    self._extract_comment_data(comment_tag_soup))

        return more_comments
 def __init__(self):
     """Set up HTTP, parsing, generator and database helpers.

     NOTE(review): method fragment — the enclosing class is not
     visible in this chunk.
     """
     self.request_manager = RequestManager()
     self.source_code_manager = SourceCodeManager()
     self.generator = Generator()
     self.db = DatabaseManager()
Beispiel #7
0
class PostScraper:
    """Scrapes subreddit post listings from reddit's JSON-bearing pages."""

    def __init__(self):
        self._request_manager = RequestManager()

    def _get_posts_from_first_soup(self, first_soup, limit):
        """Extract up to *limit* posts from the inline `script#data`
        payload of a rendered listing page.

        Stickied and crossposted entries are filtered out. Returns []
        when the script tag is absent.
        """
        try:
            script_data = first_soup.select('script#data')
            script_data_content = json.dumps(script_data[0].contents[0])
            # Strip the "window.___r = " prefix and trailing semicolon
            # to recover the bare JSON object.
            script_data_content = (script_data_content.replace(
                "window.___r = ", ""))
            script_data_content = json.loads(script_data_content)
            script_data_content_len = len(script_data_content)
            script_data_content = (
                script_data_content[:script_data_content_len - 1])

            script_data_dictionary = json.loads(script_data_content)
            script_data_list = list(
                script_data_dictionary["posts"]["models"].values())
            filtered_list = [
                post for post in script_data_list
                if post["belongsTo"]["type"] == "subreddit" and
                not post['isStickied'] and post['crosspostParentId'] is None
            ]

            if (len(filtered_list) < limit):
                return filtered_list
            return filtered_list[:limit]
        except IndexError:
            return []

    def _get_posts_after_first_soup(self, soup, limit):
        """Extract up to *limit* posts from a pagination JSON response
        (pages after the first), with the same filtering as above."""
        post_list = list(json.loads(soup.text)["posts"].values())
        filtered_list = [
            post for post in post_list
            if post["belongsTo"]["type"] == "subreddit"
            and not post["isStickied"] and post["crosspostParentId"] is None
        ]

        if (len(filtered_list) < limit):
            return filtered_list

        return filtered_list[:limit]

    def _get_processed_posts(self, posts, return_keys=None, verbose=False):
        """Project raw post dicts onto the requested shape.

        return_keys: optional list of keys to keep; when empty/None and
            not *verbose*, a default summary key set is used.
        verbose: when True (and no return_keys), pass posts through as-is.
        Returns (post_objects, post_ids).
        """
        # BUG FIX: the default was a mutable `return_keys=[]`, which is
        # shared across calls; use None as the sentinel instead.
        if return_keys is None:
            return_keys = []

        post_objects = []
        post_ids = []

        for value in posts:
            post_object = {}
            if (len(return_keys) > 0):
                for return_key in return_keys:
                    post_object[return_key] = value[return_key]
            elif (verbose):
                post_object = value
            else:
                post_object["id"] = value["id"]
                post_object["title"] = value["title"]
                post_object["numComments"] = value["numComments"]
                post_object["created"] = value["created"]
                post_object["score"] = value["score"]
                post_object["author"] = value["author"]
                post_object["upvoteRatio"] = value["upvoteRatio"]
                post_object["permalink"] = value["permalink"]
                post_object["media"] = value["media"]

            post_ids.append(value["id"])
            post_objects.append(post_object)

        return post_objects, post_ids

    def scrape_posts(self, subreddit, limit, sort_by, verbose):
        """Fetch up to *limit* posts from *subreddit* (or the frontpage
        when subreddit is empty/None), paginating with ?after=<last_id>
        until enough posts are collected or a page comes back empty."""
        post_objects_list = []
        post_ids_list = []
        posts_count = 0

        subreddit_entered = subreddit is not None and len(subreddit) > 0

        while (posts_count < limit):
            if (posts_count == 0):
                # First page: the rendered listing page.
                url = BASE_URL
                if (subreddit_entered):
                    url += "/r/{sub_name}"
                    url = url.format(sub_name=subreddit)
                url += "/{sort_by}"
                url = url.format(sort_by=sort_by)

                subreddit_post_soup = self._request_manager.get_reddit_soup(
                    url)
                posts = self._get_posts_from_first_soup(
                    subreddit_post_soup, limit)
            else:
                # Subsequent pages: continue after the last seen post id.
                remaining_limit = limit - posts_count
                if (subreddit_entered):
                    url = URL_AFTER_ID.format(sub_name=subreddit,
                                              last_id=post_ids_list[-1],
                                              sort_by=sort_by)
                else:
                    url = BASE_URL + "/{sort_by}/?after={last_id}"
                    url = url.format(sort_by=sort_by,
                                     last_id=post_ids_list[-1])

                subreddit_post_soup = self._request_manager.get_reddit_soup(
                    url)

                posts = (self._get_posts_after_first_soup(
                    subreddit_post_soup, remaining_limit) if
                         (subreddit_entered) else
                         self._get_posts_from_first_soup(
                             subreddit_post_soup, remaining_limit))

            post_objects, post_ids = self._get_processed_posts(posts,
                                                               return_keys=[],
                                                               verbose=verbose)
            post_objects_list.extend(post_objects)
            post_ids_list.extend(post_ids)
            posts_count = len(post_objects_list)

            # Nothing came back on the very first page: stop rather
            # than loop forever.
            if (posts_count == 0):
                break

        return post_objects_list
Beispiel #8
0
class LinksExtractor:

    DOMAIN = 'dubicars.com'
    PROJECT_ID = 13

    def __init__(self):
        self.request_manager = RequestManager()
        self.source_code_manager = SourceCodeManager()
        self.generator = Generator()
        self.db = DatabaseManager()

    def __createUrl(self, templateUrl, page):
        #url = templateUrl[:-1] + str(page)
        url = templateUrl.format(page)
        return url

    def findLinks(self, sourceCode):
        links = []
        status = True
        sourceCode = sourceCode.find('section', {'data-item-hash': "search"})
        listOfTags = sourceCode.findAll('li')
        for block in listOfTags:
            try:
                data = block['data-sp-item']
            except:
                continue
            data = json.loads(data)
            listing_id = data['id']
            try:
                km = int(data['km'])
            except:
                km = 101

            if km < 100:
                continue

            tag_a = block.find('a')
            href = tag_a['href']
            links.append({'url': href, 'listing_id': listing_id})
        return {'links': links, 'status': status}

    def main(self, sourceUrl):
        page = 1
        while True:
            url = self.__createUrl(sourceUrl, page)
            print url
            try:
                response = self.request_manager.take_get_request(
                    url, proxy_using=False)
            except Exception as exc:
                print exc
                break
            parseSourceCode = self.source_code_manager.parse_code(
                response['source_code'])

            links_data = self.findLinks(parseSourceCode)
            links = links_data['links']

            self.db.insert_urls(urls_list=links, source=self.DOMAIN)

            if self.isLastPage(parseSourceCode):
                print "last"
                break
            page += 1

    def find_last_page(self, code):
        pagination = code.find('div', {'class': 'paging '})
        pages = pagination.findAll('a')
        last_page = int(pages[-2].text)
        print last_page
        return last_page

    def isLastPage(self, code):
        next_page = code.find('a', {'class': 'next'})
        if next_page is None:
            return True
        else:
            return False