Beispiel #1
0
    def populate(self):
        """Scrape the result list and store every not-yet-known post.

        For each supported board in ``board_site_enums.board_sites`` the
        board's id is looked up in ``boards`` by name. Every scraped result
        whose ``id`` is not already present in ``listings.native_id`` is then
        fetched, scraped and inserted; known results are reported as dupes.

        The duplicate check is by ``native_id`` only, so a result inserted
        under one board is treated as a dupe for any later board.

        The connection and cursor are closed when the method finishes, also
        when scraping or a query raises.
        """
        # Local import keeps this method self-contained.
        from contextlib import closing

        print('making scraper')
        cl_results_scraper = CraigslistSoupMaker()
        print('making soup')
        results = cl_results_scraper.make_soup()

        conn = db.connect(dbname=POSTGRESQL_DBNAME,
                          user=POSTGRESQL_USERNAME,
                          password=POSTGRESQL_PASSWORD,
                          host=POSTGRESQL_HOST,
                          port=POSTGRESQL_PORT)
        with closing(conn), closing(conn.cursor()) as cur:
            for board in board_site_enums.board_sites:
                # Comparison kept as in the original; the flag's type is not
                # visible from this method.
                if not (board['is_supported'] == True):
                    continue

                cur.execute('SELECT id FROM boards WHERE name = %s',
                            (board['name'], ))
                board_row = cur.fetchone()
                # fetchone() yields a row (or None); the INSERT below needs
                # the scalar id, not the whole row.
                board_id = board_row[0] if board_row is not None else None

                for result in results:
                    cur.execute(
                        """SELECT id, native_id, title FROM listings WHERE native_id = %s""",
                        (result['id'], ))
                    existing_row = cur.fetchone()
                    if existing_row is not None:
                        print('dupe listing; skipping')
                        print(existing_row)
                        continue

                    post_soup_maker = CraigslistPostSoupMaker()
                    post_soup_maker.set_url(result['url'])
                    post_soup = post_soup_maker.make_soup()
                    post_data = CraigslistPostScraper(post_soup)

                    listing = Listing(id=None,
                                      native_id=result['id'],
                                      title=result['title_massaged'],
                                      body=post_data.data['body'],
                                      price=None,
                                      url=result['url'],
                                      date_posted=result['datetime'],
                                      date_scraped=None)

                    # TODO: move me to listing instance method
                    cur.execute(
                        """INSERT INTO listings (board_id, native_id, url, title, body, price, date_posted) VALUES(%s, %s, %s, %s, %s, %s, %s) RETURNING id, title;""",
                        (board_id, listing.native_id, listing.url,
                         listing.title, listing.body, listing.price,
                         listing.date_posted))
                    # Read the RETURNING row before ending the transaction.
                    inserted_row = cur.fetchone()
                    conn.commit()
                    print(inserted_row)
Beispiel #2
0
    def pull_listings():
        """Load recent, unscanned listings and add them to the registry.

        Selects rows from ``listings`` where ``is_scanned`` is false and
        ``date_posted`` lies within the last five days (UTC), builds a
        ``Listing`` from each row and calls ``add_to_registry()`` on it.
        The selected ``board_id`` column is not passed to ``Listing``.
        """
        cur = conn.cursor()
        try:
            cur.execute(
                """SELECT id, native_id, title, body, price, url, date_posted, date_scraped, board_id FROM listings WHERE is_scanned = FALSE AND date_posted >= ((now() AT TIME ZONE 'utc') - interval '5 day');"""
            )
            raw_listings = cur.fetchall()
        finally:
            # Release the cursor even if the query fails.
            cur.close()

        for (listing_id, native_id, title, body, price, url, date_posted,
             date_scraped, _board_id) in raw_listings:
            listing = Listing(id=listing_id,
                              native_id=native_id,
                              title=title,
                              body=body,
                              price=price,
                              url=url,
                              date_posted=date_posted,
                              date_scraped=date_scraped)
            listing.add_to_registry()
Beispiel #3
0
    def find_matches(cls):
        """
        Run ``cls.search_for_matches`` for every registered listing, first
        against each platform edition and then against each platform.
        """
        # Instantiated as in the original; not used further in this method.
        profiler = Profiler()

        # Getters are called inside the listing loop, editions before
        # platforms, so the order of calls matches the nested-loop form.
        item_sources = (PlatformEdition.get_all, Platform.get_all)

        for listing in Listing.get_all():
            for fetch_items in item_sources:
                for item in fetch_items():
                    cls.search_for_matches(listing, item)
Beispiel #4
0
    def pull_platform_edition_presences():
        """
        Read all rows of ``listings_platform_editions``, register a ``Match``
        for each one and return a dict that maps every platform edition id
        to the list of listing ids it was found in.
        """
        cursor = conn.cursor()
        cursor.execute(
            """SELECT listing_id, platform_edition_id, is_matched_via_body_text, index_start, index_end, score FROM listings_platform_editions;"""
        )
        rows = cursor.fetchall()
        cursor.close()

        listing_ids_by_edition = {}

        for (listing_id, edition_id, via_body_text, index_start, index_end,
             score) in rows:
            # Group listing ids under their platform edition id.
            listing_ids_by_edition.setdefault(edition_id,
                                              []).append(listing_id)

            # Register the stored match between the edition and the listing.
            Match(score=score,
                  is_matched_via_body_text=via_body_text,
                  start=index_start,
                  end=index_end,
                  item=PlatformEdition.get_by_id(edition_id),
                  listing=Listing.get_by_id(listing_id)).add_to_registry()

        print('inner Match')
        print(Match)

        return listing_ids_by_edition
    def generate_message_text(self):
        """Build the plain-text "PLATFORMS & EDITIONS" part of the message.

        A platform is included when its id is a key of
        ``self.platform_editions`` or is contained in ``self.platforms``.
        For each included platform a header line is written; if the platform
        has edition ids in ``self.platform_editions``, each edition's
        referential name follows, and under it the title, price, url and
        posting date of every listing recorded for that edition in
        ``Mailer.pe_presences_per_pe``.

        Listing ids that ``Listing.get_by_id`` cannot resolve are skipped,
        as in ``generate_message_html``.

        Returns:
            The assembled text as a single string.
        """
        parts = ['PLATFORMS & EDITIONS\n\n']

        # Debug dump of the listing registry, kept from the original.
        pp.pprint(Listing.registry)

        for platform in Platform.get_all():
            is_watched = platform.id in self.platform_editions or (
                self.platforms and platform.id in self.platforms)
            if not is_watched:
                continue

            # ----- Platform Name -----
            parts.append('----- ' + platform.name + ' -----\n')

            edition_ids = self.platform_editions.get(platform.id)
            if not edition_ids:
                continue

            for edition_id in edition_ids:
                edition = PlatformEdition.get_by_id(edition_id)
                parts.append(edition.referencial_name() + '\n')

                listing_ids = Mailer.pe_presences_per_pe.get(edition.id) or ()
                for listing_id in listing_ids:
                    listing = Listing.get_by_id(listing_id)
                    if listing is None:
                        # A presence may point at a listing that is not in
                        # the registry; skip it instead of crashing.
                        continue

                    if listing.price is None:
                        price_text = '(price not listed)'
                    else:
                        # str() so a non-string price does not break the
                        # concatenation.
                        price_text = str(listing.price)

                    parts.extend((
                        '\t' + listing.title + '\n',
                        '\t' + price_text + '\n',
                        '\t' + listing.url + '\n',
                        '\t' + str(listing.date_posted) + '\n',
                        # blank line between listings
                        '\t' + '\n',
                    ))

            # blank line after a platform that has editions
            parts.append('\n')

        return ''.join(parts)
    def generate_message_html(self):
        """Build the HTML "PLATFORMS & EDITIONS" part of the message.

        A platform is considered when its id is a key of
        ``self.platform_editions`` or is contained in ``self.platforms``.
        For such a platform the method renders

        * a "General" list from the listing ids in
          ``self.platform_presences_per_platform`` for that platform, and
        * one list per edition id in ``self.platform_editions``, from the
          listing ids in ``Mailer.pe_presences_per_pe`` for that edition.

        Listing ids that ``Listing.get_by_id`` cannot resolve are skipped.
        Platforms and editions without any rendered listing are left out.

        Returns:
            A tuple ``(html, products_matched_ct)``. ``html`` is empty when
            nothing matched; ``products_matched_ct`` counts every edition
            and every platform "General" section that has listings.
        """
        list_start = '\n<ul style="padding: 0; list-style: none;">'
        list_end = '\n</ul>'

        def render_listings(listing_ids, item):
            """Return the concatenated listing HTML for ``item``."""
            fragments = []
            listing_ct = 0
            for listing_id in listing_ids:
                listing = Listing.get_by_id(listing_id)
                if listing is None:
                    continue
                listing_ct += 1
                match = Match.get_by_info(listing, item)
                fragments.append(
                    Mailer.create_listing_html(match, listing_ct))
            return ''.join(fragments)

        products_matched_ct = 0
        platform_sections = []

        for platform in Platform.get_all():
            is_watched = platform.id in self.platform_editions or (
                self.platforms and platform.id in self.platforms)
            if not is_watched:
                continue

            # Listings matched against the platform itself.
            general_listings = render_listings(
                self.platform_presences_per_platform.get(platform.id) or (),
                platform)

            # Listings matched against each watched edition of the platform.
            edition_sections = []
            for edition_id in self.platform_editions.get(platform.id) or ():
                edition = PlatformEdition.get_by_id(edition_id)
                edition_referencial_name = edition.referencial_name()

                edition_listings = render_listings(
                    Mailer.pe_presences_per_pe.get(edition.id) or (), edition)
                if not edition_listings:
                    continue

                products_matched_ct += 1
                # Appended inside the loop so that every edition with
                # listings ends up in the message, not only the last one.
                edition_sections.append(
                    f'\n<h4>{edition_referencial_name}</h4>' + list_start +
                    edition_listings + list_end)

            if not general_listings and not edition_sections:
                continue

            section = [f'\n<h3>{platform.name}</h3>']
            if general_listings:
                products_matched_ct += 1
                section.append('\n<h4>General</h4>')
                section.append(
                    f"""{list_start}\n{general_listings}\n{list_end}""")
            section.extend(edition_sections)
            platform_sections.append(''.join(section))

        if not platform_sections:
            return '', products_matched_ct

        message_html = ('<h2>PLATFORMS & EDITIONS</h2>\n\n' +
                        ''.join(platform_sections))
        return message_html, products_matched_ct