def pull_platform_editions():
    """Load all platform editions from the database into the in-memory registries.

    Runs a single query on the module-level ``conn`` that joins each edition
    to its platform and to a comma-separated aggregate of its color names.
    For every row a ``PlatformEdition`` is built, its colors are filled in,
    it is attached to its platform via ``Platform.get_by_id(...).add_edition``
    and finally added to its own registry.

    Because the color aggregate is inner-joined, editions without any color
    row are not returned by the query.

    Returns:
        None. The function works purely through registry side effects.
    """
    cur = conn.cursor()
    try:
        cur.execute(
            """SELECT pe.id AS id,
                      pe.name AS name,
                      pe.official_color AS official_color,
                      pe.has_matte AS has_matte,
                      pe.has_transparency AS has_transparency,
                      pe.has_gloss AS has_gloss,
                      pe.note AS note,
                      pe.image_url AS image_url,
                      x.colors,
                      p.id AS platform_id
               FROM platforms AS p
               JOIN platform_editions AS pe ON pe.platform_id = p.id
               JOIN (SELECT pe.id AS id,
                            STRING_AGG(c.name,', ') AS colors
                     FROM platform_editions AS pe
                     JOIN colors_platform_editions AS cpe
                       ON cpe.platform_edition_id = pe.id
                     JOIN colors AS c ON c.id = cpe.color_id
                     GROUP BY pe.id
                     ORDER BY pe.id) AS x ON x.id = pe.id
               ORDER BY p.id, name, official_color;
            """)
        raw_platform_editions = cur.fetchall()
    finally:
        # Release the cursor even when the query fails.
        cur.close()

    for raw_pe in raw_platform_editions:
        (edition_id, name, official_color, has_matte, has_transparency,
         has_gloss, note, image_url, colors, platform_id) = raw_pe
        pe = PlatformEdition(id=edition_id,
                             name=name,
                             official_color=official_color,
                             has_matte=has_matte,
                             has_transparency=has_transparency,
                             has_gloss=has_gloss,
                             note=note,
                             image_url=image_url)
        # The separator must match the one used by STRING_AGG in the query.
        pe.colors.extend(colors.split(', '))
        # Attach the edition to its platform before registering it.
        Platform.get_by_id(platform_id).add_edition(pe)
        pe.add_to_registry()
def get_platform_by_id(id):
    """Fetch a single platform, joined with its platform family, by primary key.

    Args:
        id: Value compared against ``platforms.id`` (passed as a bound
            query parameter).

    Returns:
        A ``Platform`` built from the matching row, or ``None`` when no row
        matches.
    """
    conn = get_db_connection()
    try:
        cur = conn.cursor()
        try:
            cur.execute("""
                SELECT p.id as id,
                       p.name as name,
                       p.is_brand_missing as is_brand_missing,
                       pf.id as platform_family_id,
                       pf.name as platform_family_name,
                       p.model_no as model_no,
                       p.storage_capacity as storage_capacity,
                       p.description as description,
                       p.disambiguation as disambiguation,
                       p.relevance as relevance
                FROM platforms as p
                JOIN platform_families as pf ON pf.id = p.platform_family_id
                WHERE p.id=%s
                LIMIT 1;
                """, (id,))
            row = cur.fetchone()
        finally:
            cur.close()
    finally:
        # Always hand the connection back, even when the query raises.
        conn.close()

    if row is None:
        return None
    return Platform(id=row[0],
                    name=row[1],
                    is_brand_missing_from_name=row[2],
                    platform_family_id=row[3],
                    platform_family_name=row[4],
                    model_no=row[5],
                    storage_capacity=row[6],
                    description=row[7],
                    disambiguation=row[8],
                    relevance=row[9])
def get_all_platforms():
    """Fetch every platform, joined with its platform family.

    Returns:
        A list of ``Platform`` objects ordered by ``relevance`` descending
        (the ordering is done by the query). Empty when the table is empty.
    """
    conn = get_db_connection()
    try:
        cur = conn.cursor()
        try:
            cur.execute("""
                SELECT p.id as id,
                       p.name as name,
                       p.is_brand_missing as is_brand_missing,
                       pf.id as platform_family_id,
                       pf.name as platform_family_name,
                       p.model_no as model_no,
                       p.storage_capacity as storage_capacity,
                       p.description as description,
                       p.disambiguation as disambiguation,
                       p.relevance as relevance
                FROM platforms as p
                JOIN platform_families as pf ON pf.id = p.platform_family_id
                ORDER BY p.relevance DESC;
                """)
            rows = cur.fetchall()
        finally:
            cur.close()
    finally:
        # Always hand the connection back, even when the query raises.
        conn.close()

    return [
        Platform(id=row[0],
                 name=row[1],
                 is_brand_missing_from_name=row[2],
                 platform_family_id=row[3],
                 platform_family_name=row[4],
                 model_no=row[5],
                 storage_capacity=row[6],
                 description=row[7],
                 disambiguation=row[8],
                 relevance=row[9])
        for row in rows
    ]
def find_matches(cls):
    """Run the matcher for every listing against all editions, then all platforms.

    For each listing returned by ``Listing.get_all()`` this calls
    ``cls.search_for_matches(listing, item)`` once per platform edition and
    then once per platform.
    """
    # Instantiated exactly as before; the object is not used further here.
    profiler = Profiler()
    for listing in Listing.get_all():
        # Editions are searched before platforms for each listing.
        for catalogue in (PlatformEdition, Platform):
            for item in catalogue.get_all():
                cls.search_for_matches(listing, item)
def pull_platforms():
    """Load all platforms from the database into the ``Platform`` registry.

    Runs one query on the module-level ``conn`` joining each platform to its
    platform family, builds a ``Platform`` per row and calls
    ``add_to_registry()`` on it.

    Returns:
        None. The function works purely through registry side effects.
    """
    cur = conn.cursor()
    try:
        cur.execute(
            """SELECT p.id,
                      p.name,
                      p.is_brand_missing,
                      p.platform_family_id,
                      pf.name as platform_family_name,
                      p.model_no,
                      p.storage_capacity,
                      p.description,
                      p.disambiguation,
                      p.relevance
               FROM platforms as p
               JOIN platform_families as pf ON pf.id = p.platform_family_id
               LEFT JOIN platform_name_groups as png ON png.id = p.name_group_id;"""
        )
        raw_ps = cur.fetchall()
    finally:
        # Release the cursor even when the query fails.
        cur.close()

    for raw_p in raw_ps:
        p = Platform(id=raw_p[0],
                     name=raw_p[1],
                     is_brand_missing_from_name=raw_p[2],
                     platform_family_id=raw_p[3],
                     platform_family_name=raw_p[4],
                     model_no=raw_p[5],
                     storage_capacity=raw_p[6],
                     description=raw_p[7],
                     disambiguation=raw_p[8],
                     relevance=raw_p[9])
        p.add_to_registry()
def generate_message_text(self):
    """Build the plain-text "PLATFORMS & EDITIONS" section of the message.

    A platform is included when its id is a key of ``self.platform_editions``
    or is contained in ``self.platforms``. For each included platform the
    output holds a ``----- name -----`` header, then one line per edition
    (``edition.referencial_name()``) followed by a tab-indented block of
    title, price, url and posting date for every listing recorded for that
    edition in ``Mailer.pe_presences_per_pe``.

    Listing ids that ``Listing.get_by_id`` cannot resolve are skipped, the
    same way ``generate_message_html`` treats them, instead of raising
    ``AttributeError`` on ``None``.

    Returns:
        str: The section text; it always starts with the category title,
        even when no platform qualifies.
    """
    # NOTE(review): dumps the whole listing registry to stdout on every call;
    # looks like leftover debugging output - confirm before removing.
    pp.pprint(Listing.registry)

    sections = ['PLATFORMS & EDITIONS\n\n']
    for platform in Platform.get_all():
        if not (platform.id in self.platform_editions or
                (self.platforms and platform.id in self.platforms)):
            continue

        # ----- Platform Name -----
        lines = ['----- ' + platform.name + ' -----\n']
        for edition_id in self.platform_editions.get(platform.id) or ():
            edition = PlatformEdition.get_by_id(edition_id)
            lines.append(edition.referencial_name() + '\n')
            for listing_id in Mailer.pe_presences_per_pe.get(edition.id) or ():
                listing = Listing.get_by_id(listing_id)
                if listing is None:
                    # Presence refers to a listing that is not registered.
                    continue
                if listing.price is None:
                    price = '(price not listed)'
                else:
                    price = listing.price
                lines.append('\t' + listing.title + '\n')
                lines.append('\t' + price + '\n')
                lines.append('\t' + listing.url + '\n')
                lines.append('\t' + str(listing.date_posted) + '\n')
                # blank line between listings
                lines.append('\t' + '\n')
        # blank line closing the platform block
        lines.append('\n')
        sections.append(''.join(lines))
    return ''.join(sections)
def pull_platform_presences():
    """Load platform presences and register a ``Match`` for each of them.

    Reads every row of ``listings_platforms`` through the module-level
    ``conn``. For each row a ``Match`` is created (its item and listing are
    resolved with ``Platform.get_by_id`` / ``Listing.get_by_id``) and added
    to the match registry.

    Returns:
        dict: Maps each ``platform_id`` to the list of ``listing_id`` values
        found for it, in the order the rows were returned.
    """
    cur = conn.cursor()
    try:
        cur.execute(
            """SELECT listing_id,
                      platform_id,
                      is_matched_via_body_text,
                      index_start,
                      index_end,
                      score
               FROM listings_platforms;"""
        )
        rows = cur.fetchall()
    finally:
        # Release the cursor even when the query fails.
        cur.close()

    listing_ids_by_platform = {}
    for (listing_id, platform_id, via_body_text, index_start, index_end,
         score) in rows:
        listing_ids_by_platform.setdefault(platform_id, []).append(listing_id)
        match = Match(score=score,
                      is_matched_via_body_text=via_body_text,
                      start=index_start,
                      end=index_end,
                      item=Platform.get_by_id(platform_id),
                      listing=Listing.get_by_id(listing_id))
        match.add_to_registry()
    return listing_ids_by_platform
def get_searched_platforms(q=''):
    """Fetch platforms whose name or model number contains ``q``.

    The comparison uses ``ILIKE '%q%'`` on ``platforms.name`` and
    ``platforms.model_no``. ``q`` is passed as a bound parameter, but it is
    not escaped for LIKE, so ``%`` and ``_`` inside ``q`` act as wildcards.

    Args:
        q: Search text. The default empty string produces the pattern
            ``%%``.

    Returns:
        A list of ``Platform`` objects ordered by ``relevance`` descending.
    """
    params = {'like_q': '%' + q + '%'}

    conn = get_db_connection()
    try:
        cur = conn.cursor()
        try:
            cur.execute("""
                SELECT p.id as id,
                       p.name as name,
                       p.is_brand_missing as is_brand_missing,
                       pf.id as platform_family_id,
                       pf.name as platform_family_name,
                       p.model_no as model_no,
                       p.storage_capacity as storage_capacity,
                       p.description as description,
                       p.disambiguation as disambiguation,
                       p.relevance as relevance
                FROM platforms as p
                JOIN platform_families as pf ON pf.id = p.platform_family_id
                WHERE p.name ILIKE %(like_q)s OR p.model_no ILIKE %(like_q)s
                ORDER BY p.relevance DESC;
                """, params)
            rows = cur.fetchall()
        finally:
            cur.close()
    finally:
        # Always hand the connection back, even when the query raises.
        conn.close()

    return [
        Platform(id=row[0],
                 name=row[1],
                 is_brand_missing_from_name=row[2],
                 platform_family_id=row[3],
                 platform_family_name=row[4],
                 model_no=row[5],
                 storage_capacity=row[6],
                 description=row[7],
                 disambiguation=row[8],
                 relevance=row[9])
        for row in rows
    ]
def search_for_platform_edition_matches(cls, listing, edn):
    """Register a ``Match`` for every occurrence of an edition phrase in a listing.

    Search phrases ("hottexts") are assembled from the edition's name,
    colors, official color and finish flags together with the name and model
    number of the platform returned by ``Platform.get_by_edition_id``. They
    are grouped by degree (``exact``, ``strong``, ``weak``, ``minor``); the
    degree selects the score through ``cls.MATCH_MULTIPLIERS``.

    Every phrase is searched case-sensitively in ``listing.title`` and then
    in ``listing.body``. All occurrences are visited, overlapping ones
    included, and a match is added to the registry only when its score is
    above ``cls.MATCH_SCORE_THRESHOLD``.

    Args:
        listing: Object providing ``title`` and ``body`` texts. A text that
            is ``None`` or empty is skipped.
        edn: The platform edition to look for.

    Returns:
        None. Results are recorded through ``Match.add_to_registry()``.
    """
    platform = Platform.get_by_edition_id(edn.id)
    hottexts = {'exact': [], 'strong': [], 'weak': [], 'minor': []}

    colors_csv = ', '.join(edn.colors)
    colors_amp = ' & '.join(edn.colors)
    has_colors_and_platform = len(edn.colors) > 0 and bool(platform.name)

    if has_colors_and_platform and not edn.name:
        hottexts['strong'].append(f"{colors_csv} {platform.name}")
        hottexts['strong'].append(f"{platform.name} {colors_csv}")
    if has_colors_and_platform and edn.name:
        hottexts['weak'].append(f"{colors_csv} {platform.name}")
        hottexts['weak'].append(f"{platform.name} {colors_csv}")
        # every ordering of edition name, platform name and colors
        hottexts['exact'].append(f"{edn.name} {platform.name} {colors_csv}")
        hottexts['exact'].append(f"{edn.name} {colors_csv} {platform.name}")
        hottexts['exact'].append(f"{platform.name} {edn.name} {colors_csv}")
        hottexts['exact'].append(f"{platform.name} {colors_csv} {edn.name}")
        hottexts['exact'].append(f"{colors_csv} {edn.name} {platform.name}")
        hottexts['exact'].append(f"{colors_csv} {platform.name} {edn.name}")

    finishes = (
        (edn.has_matte, 'matte'),
        (edn.has_transparency, 'transparent'),
        (edn.has_gloss, 'glossy'),
    )
    for has_finish, finish_word in finishes:
        if has_colors_and_platform and has_finish:
            hottexts['strong'].append(
                f"{finish_word} {colors_csv} {platform.name}")
            hottexts['strong'].append(
                f"{finish_word} {colors_amp} {platform.name}")

    if has_colors_and_platform and platform.model_no:
        hottexts['exact'].append(
            f"{colors_csv} {platform.model_no} {platform.name}")
        hottexts['exact'].append(
            f"{platform.model_no} {colors_csv} {platform.name}")
        hottexts['exact'].append(
            f"{colors_csv} {platform.name} {platform.model_no}")
    if edn.name and edn.official_color:
        hottexts['strong'].append(f"{edn.name} {edn.official_color}")
        hottexts['strong'].append(f"{edn.official_color} {edn.name}")
    if edn.name and platform.name:
        hottexts['strong'].append(f"{edn.name} {platform.name}")
        hottexts['strong'].append(f"{platform.name} {edn.name}")
    if edn.official_color and platform.name:
        hottexts['strong'].append(f"{platform.name} {edn.official_color}")
        hottexts['strong'].append(f"{edn.official_color} {platform.name}")
    # The bare edition name is strong only when nothing else identifies it.
    if edn.name and (not edn.official_color and not platform.model_no):
        hottexts['strong'].append(f"{edn.name}")
    if edn.name and (edn.official_color or platform.model_no):
        hottexts['weak'].append(f"{edn.name}")
    if edn.official_color and edn.official_color not in edn.colors:
        hottexts['minor'].append(f"{edn.official_color}")

    # TODO: split up function so components are testable
    # TODO: add brand; pair base search texts with anti-match protocols
    for degree, phrases in hottexts.items():
        for hottext in phrases:
            # Carry the body flag explicitly: comparing the text with
            # listing.body mislabels title hits when title == body.
            for text, is_body in ((listing.title, False),
                                  (listing.body, True)):
                if not text:
                    continue
                start = text.find(hottext)
                while start != -1:
                    match = Match(score=cls.MATCH_MULTIPLIERS[degree],
                                  is_matched_via_body_text=is_body,
                                  start=start,
                                  end=start + len(hottext),
                                  item=edn,
                                  listing=listing)
                    if match.score > cls.MATCH_SCORE_THRESHOLD:
                        match.add_to_registry()
                    # Resume one character later so overlaps are found too.
                    start = text.find(hottext, start + 1)
def generate_message_html(self):
    """Build the HTML "PLATFORMS & EDITIONS" section of the message.

    A platform is considered when its id is a key of
    ``self.platform_editions`` or is contained in ``self.platforms``. Its
    section shows an ``<h3>`` title, an optional "General" list built from
    ``self.platform_presences_per_platform`` and one ``<h4>`` + list per
    edition that has listings in ``Mailer.pe_presences_per_pe``. Listing ids
    that ``Listing.get_by_id`` cannot resolve are skipped. Platforms that
    end up with no listing at all produce no output.

    Returns:
        tuple: ``(html, count)`` where ``html`` is the section markup (empty
        string when nothing matched) and ``count`` is the number of
        non-empty "General" blocks plus non-empty edition blocks.
    """
    list_open = '\n<ul style="padding: 0; list-style: none;">'
    list_close = '\n</ul>'

    def render_listings(listing_ids, item):
        # One HTML fragment per resolvable listing, numbered from 1.
        fragments = []
        shown = 0
        for listing_id in listing_ids:
            listing = Listing.get_by_id(listing_id)
            if listing is None:
                continue
            shown += 1
            match = Match.get_by_info(listing, item)
            fragments.append(Mailer.create_listing_html(match, shown))
        return ''.join(fragments)

    matched_products = 0
    platform_sections = []
    for platform in Platform.get_all():
        if not (platform.id in self.platform_editions or
                (self.platforms and platform.id in self.platforms)):
            continue

        platform_title = f'\n<h3>{platform.name}</h3>'

        # listings matched against the platform itself
        general_ids = self.platform_presences_per_platform.get(platform.id)
        general_items = ''
        if general_ids:
            general_items = render_listings(general_ids, platform)
        general_html = ''
        if general_items:
            general_html = f"""{list_open}\n{general_items}\n{list_close}"""

        # listings matched against each edition of the platform
        edition_sections = []
        for edition_id in self.platform_editions.get(platform.id) or ():
            edition = PlatformEdition.get_by_id(edition_id)
            edition_title = f'\n<h4>{edition.referencial_name()}</h4>'
            edition_listing_ids = Mailer.pe_presences_per_pe.get(edition.id)
            edition_items = ''
            if edition_listing_ids:
                edition_items = render_listings(edition_listing_ids, edition)
            if edition_items:
                matched_products += 1
                edition_sections.append(
                    edition_title + list_open + edition_items + list_close)
        editions_html = ''.join(edition_sections)

        if not (editions_html or general_html):
            continue

        section = platform_title
        if general_html:
            matched_products += 1
            section += '\n<h4>General</h4>' + general_html
        section += editions_html
        platform_sections.append(section)

    all_platforms_html = ''.join(platform_sections)
    if not all_platforms_html:
        return '', matched_products
    # category title followed by category content
    return ('<h2>PLATFORMS & EDITIONS</h2>\n\n' + all_platforms_html,
            matched_products)
def build_string_matches(self, item):
    """Build the positive match strings for a platform edition.

    The strings are grouped into four buckets (``exact``, ``strong``,
    ``weak``, ``minor``) and combine the edition's name, official color and
    colors with the model number and family name of the platform returned by
    ``Platform.get_by_edition_id``.

    NOTE(review): the nesting below was reconstructed from a flattened
    source; confirm the placement of the ``elif``/``else`` branches and of
    the appends inside the color loop against the intended behavior.

    Args:
        item: Object whose type name decides the branch taken. Only
            ``PlatformEdition`` is handled.

    Returns:
        dict: The ``match_strings`` buckets, each a list of strings.

    Raises:
        Exception: For any item whose type name is not ``PlatformEdition``
            (a ``Platform`` item is announced with ``print`` first and then
            reaches the same ``raise``).
    """
    # print('building item match profile for ' + str(item) + '...')
    match_strings = {'exact': [], 'strong': [], 'weak': [], 'minor': []}
    if type(item).__name__ == 'Platform':
        print('making match strings for ' + str(item))
    if type(item).__name__ == 'PlatformEdition':
        platform = Platform.get_by_edition_id(item.id)
        # True when the edition carries the same name as its platform family;
        # family-name combinations are skipped in that case.
        is_platform_family_namesake = True if item.name == platform.platform_family_name else False
        # FIXME: add developer, alternate_names for robust matches
        # positive exact matches
        if platform.model_no:
            match_strings['exact'].append(platform.model_no)
        if item.official_color and item.name:
            if not is_platform_family_namesake:
                match_strings['exact'].append(
                    item.official_color + ' ' + item.name + ' ' +
                    platform.platform_family_name)
            # NOTE(review): item.name appears twice in the next three
            # strings; possibly one occurrence was meant to be another
            # field - confirm.
            match_strings['exact'].append(item.official_color + ' ' +
                                          item.name + ' ' + item.name)
            match_strings['exact'].append(item.name + ' ' +
                                          item.official_color + ' ' +
                                          item.name)
            match_strings['exact'].append(item.name + ' ' + item.name + ' ' +
                                          item.official_color)
            match_strings['exact'].append(item.official_color + ' ' +
                                          item.name)
            match_strings['exact'].append(item.name + ' ' +
                                          item.official_color)
        elif item.name:
            match_strings['exact'].append(item.name + ' ' + item.name)
        # positive strong matches
        # FIXME: check that platform name isn't the generic name contained within other similar platform names like "3DS" is in "New 3DS", "3DS XL", "New 3DS XL", etc.
        if not is_platform_family_namesake and item.name:
            match_strings['strong'].append(item.name)
        for color in item.colors:
            # These two appends do not depend on ``color``, so they repeat
            # once per color.
            if item.name and not is_platform_family_namesake:
                match_strings['strong'].append(
                    item.name + ' ' + platform.platform_family_name)
            if item.name:
                match_strings['strong'].append(item.name + ' ' + item.name)
            # Skip the color that equals the lower-cased official color.
            # NOTE(review): item.name is concatenated here without a
            # truthiness check - confirm it cannot be None at this point.
            if not item.official_color or color != item.official_color.lower(
            ):
                match_strings['strong'].append(color + ' ' + item.name)
        if item.official_color and item.name:
            match_strings['strong'].append(item.official_color + ' ' +
                                           item.name)
        if item.official_color and not is_platform_family_namesake:
            match_strings['strong'].append(item.official_color + ' ' +
                                           platform.platform_family_name)
        # positive weak matches
        for color in item.colors:
            if item.name and not is_platform_family_namesake:
                match_strings['weak'].append(color + ' ' +
                                             platform.platform_family_name)
        # positive minor matches
        if not is_platform_family_namesake:
            match_strings['minor'].append(platform.platform_family_name)
        # negative matches
        # Built empty here and neither filled nor returned by this function.
        antimatch_strings = {
            'anywhere': [],
            'before': {
                'space_separated_yes': [],
                'space_separated_no': []
            },
            'after': {
                'space_separated_yes': [],
                'space_separated_no': []
            }
        }
    else:
        raise Exception()
    return match_strings