def automatch_productions(releaser):
    """
    Match this releaser's unmatched Demozoo and Janeway productions by title.

    Productions whose (cleaned title, supertype) pair occurs exactly once on
    each side are linked via a KestraBitworldRelease ProductionLink. If that
    leaves zero unmatched Demozoo prods, the remaining Janeway prods are
    imported outright. Finally the releaser's AuthorMatchInfo counts are
    refreshed.
    """
    unmatched_demozoo_prods, unmatched_janeway_prods, matched_prods = get_production_match_data(
        releaser)
    matched_production_count = len(matched_prods)
    unmatched_demozoo_production_count = len(unmatched_demozoo_prods)
    unmatched_janeway_production_count = len(unmatched_janeway_prods)
    # mapping of (cleaned prod title, supertype) to a pair of lists of demozoo IDs
    # and janeway IDs of prods with that name
    prods_by_name_and_supertype = defaultdict(lambda: ([], []))
    for id, title, url, supertype in unmatched_demozoo_prods:
        prods_by_name_and_supertype[(generate_search_title(title), supertype)][0].append(id)
    for id, title, url, supertype in unmatched_janeway_prods:
        if supertype == 'music':
            # strip music file extensions so the titles compare cleanly
            title = strip_music_extensions(title)
        prods_by_name_and_supertype[(generate_search_title(title), supertype)][1].append(id)
    just_matched_janeway_ids = set()
    for (title, supertype), (demozoo_ids, janeway_ids) in prods_by_name_and_supertype.items():
        # only link when the title is unambiguous on both sides
        if len(demozoo_ids) == 1 and len(janeway_ids) == 1:
            ProductionLink.objects.create(
                production_id=demozoo_ids[0],
                link_class='KestraBitworldRelease',
                parameter=janeway_ids[0],
                is_download_link=False,
                source='janeway-automatch',
            )
            just_matched_janeway_ids.add(janeway_ids[0])
            matched_production_count += 1
            unmatched_demozoo_production_count -= 1
            unmatched_janeway_production_count -= 1
    if unmatched_demozoo_production_count == 0:
        # all matchable prods are accounted for, so let's go on and import the
        # remaining ones from janeway
        for id, title, url, supertype in unmatched_janeway_prods:
            if id in just_matched_janeway_ids:
                continue
            import_release(JanewayRelease.objects.get(janeway_id=id))
            matched_production_count += 1
            unmatched_janeway_production_count -= 1
    AuthorMatchInfo.objects.update_or_create(
        releaser_id=releaser.id, defaults={
            'matched_production_count': matched_production_count,
            'unmatched_demozoo_production_count': unmatched_demozoo_production_count,
            'unmatched_janeway_production_count': unmatched_janeway_production_count,
        })
def get_dz_releaser_ids_matching_by_name_and_type(janeway_author):
    """
    Return a list of Demozoo releaser IDs which match this Janeway author by
    at least one name, and are the same type (scener or group)
    """
    # clean every name the Janeway author is known by
    search_titles = [
        generate_search_title(author_name.name)
        for author_name in janeway_author.names.all()
    ]
    matches = Releaser.objects.filter(
        is_group=janeway_author.is_group,
        nicks__variants__search_title__in=search_titles,
    ).distinct()
    return list(matches.values_list('id', flat=True))
def get_member_clean_names(self):
    """
    return a list of cleaned versions of all names used by members of this
    group (excluding ones of <=3 letters)
    """
    member_names = Name.objects.filter(author__group_memberships__group=self)
    cleaned = (generate_search_title(member_name.name) for member_name in member_names)
    # drop short names (abbreviations) which would produce false matches
    return [title for title in cleaned if len(title) > 3]
def get_group_clean_names(self):
    """
    return a list of cleaned versions of all names of groups this scener is a
    member of (excluding ones of <=3 letters)
    """
    group_names = Name.objects.filter(
        author__is_group=True, author__member_memberships__member=self)
    cleaned = (generate_search_title(group_name.name) for group_name in group_names)
    # drop short names (abbreviations) which would produce false matches
    return [title for title in cleaned if len(title) > 3]
def save(self, *args, **kwargs):
    """
    Save the BBS, keeping the denormalised search_title and the associated
    Name records in sync with the current name.
    """
    self.search_title = generate_search_title(self.name)
    super().save(*args, **kwargs)
    # if name has changed, remove the old Name record
    if self.name != self._original_name:
        self.names.filter(name=self._original_name).delete()
        self._original_name = self.name
    # ensure that a Name with matching name exists for this BBS;
    # get_or_create is called purely for its side effect, so the previously
    # unused `name, created` unpacking has been dropped
    Name.objects.get_or_create(bbs=self, name=self.name)
def save(self, *args, **kwargs):
    """Save the party, syncing search_title and the cached share_image_file_url."""
    # derive the normalised search title from the name before writing
    if self.name:
        self.search_title = generate_search_title(self.name)
    super(Party, self).save(*args, **kwargs)
    if not self.share_image_file:
        if self.share_image_file_url:
            # the file has gone away; blank out the stale cached URL
            Party.objects.filter(pk=self.pk).update(share_image_file_url='')
            self.share_image_file_url = ''
    else:
        file_url = self.share_image_file.url
        if file_url != self.share_image_file_url:
            # refresh the cached URL to point at the current file
            Party.objects.filter(pk=self.pk).update(share_image_file_url=file_url)
            self.share_image_file_url = file_url
def save(self, *args, **kwargs):
    """Save the party, keeping search_title and share_image_file_url up to date."""
    # regenerate the normalised search title whenever a name is present
    if self.name:
        self.search_title = generate_search_title(self.name)
    super(Party, self).save(*args, **kwargs)
    if not self.share_image_file:
        if self.share_image_file_url:
            # file removed: clear the stale cached URL
            Party.objects.filter(pk=self.pk).update(share_image_file_url='')
            self.share_image_file_url = ''
    else:
        current_url = self.share_image_file.url
        if current_url != self.share_image_file_url:
            # cache the file's URL on the denormalised column
            Party.objects.filter(pk=self.pk).update(share_image_file_url=current_url)
            self.share_image_file_url = current_url
def save(self, *args, **kwargs):
    """Persist the production, deriving supertype and search fields first."""
    # backfill supertype from the inferred value when an existing record lacks one
    if self.id and not self.supertype:
        self.supertype = self.inferred_supertype
    if self.title:
        stripped_title = self.title.strip()
        self.title = stripped_title
        self.search_title = generate_search_title(stripped_title)
        self.sortable_title = generate_sort_key(stripped_title)
    # updated_at is non-null at the db level, so seed it on first save
    if self.updated_at is None:
        self.updated_at = datetime.datetime.now()
    return super(Production, self).save(*args, **kwargs)
def save(self, *args, **kwargs):
    """Save the production after normalising supertype, title and search keys."""
    # an existing record without a supertype picks up the inferred one
    if self.id and not self.supertype:
        self.supertype = self.inferred_supertype
    if self.title:
        title = self.title.strip()
        self.title = title
        self.search_title = generate_search_title(title)
        self.sortable_title = generate_sort_key(title)
    # auto-populate updated_at on creation; the db column is non-null
    if self.updated_at is None:
        self.updated_at = datetime.datetime.now()
    return super(Production, self).save(*args, **kwargs)
def live_search(request):
    """
    Autocomplete endpoint: return up to 10 prefix matches for ?q= as JSON.

    Builds a union of per-model querysets (productions, releasers, parties,
    BBSes) filtered by the cleaned query prefix, optionally restricted by
    ?category=, then fetches the real model instances and serialises a small
    dict per hit (type, url, display value, thumbnail for productions).
    """
    query = request.GET.get('q')
    category = request.GET.get('category')
    if query:
        clean_query = generate_search_title(query)
        # start with an empty queryset
        qs = Production.objects.annotate(type=models.Value(
            'empty', output_field=models.CharField()),
        ).values('pk', 'type').none()
        if (not category) or category in ('production', 'graphics', 'music'):
            prod_qs = Production.objects.annotate(
                type=models.Value('production', output_field=models.CharField()),
                name=models.Value('', output_field=models.CharField()),
            ).order_by().filter(search_title__startswith=clean_query).values(
                'pk', 'type')
            if category in ('production', 'graphics', 'music'):
                # a specific prod supertype was requested; narrow to it
                prod_qs = prod_qs.filter(supertype=category)
            qs = qs.union(prod_qs)
        if (not category) or category in ('scener', 'group'):
            releaser_qs = Releaser.objects.annotate(
                type=models.Value('releaser', output_field=models.CharField()),
            ).order_by('pk').filter(
                nicks__variants__search_title__startswith=clean_query).values(
                'pk', 'type').distinct()
            if category in ('scener', 'group'):
                releaser_qs = releaser_qs.filter(
                    is_group=(category == 'group'))
            qs = qs.union(releaser_qs)
        if (not category) or category == 'party':
            qs = qs.union(
                Party.objects.annotate(type=models.Value(
                    'party', output_field=models.CharField()),
                ).order_by().filter(
                    search_title__startswith=clean_query).values(
                    'pk', 'type'))
        if (not category) or category == 'bbs':
            qs = qs.union(
                BBS.objects.annotate(type=models.Value(
                    'bbs', output_field=models.CharField()),
                ).order_by().filter(
                    search_title__startswith=clean_query).values(
                    'pk', 'type'))
        # cap the union at 10 hits before touching the real tables
        search_result_data = list(qs[:10])
        # Assemble the results into a plan for fetching the actual models -
        # form a dict that maps model/type to a set of PKs
        to_fetch = {}
        for d in search_result_data:
            to_fetch.setdefault(d['type'], set()).add(d['pk'])
        # now do the fetches, and store the results as a mapping of (type, pk) tuple to object
        fetched = {}
        if 'production' in to_fetch:
            production_ids = to_fetch['production']
            productions = Production.objects.filter(
                pk__in=production_ids).prefetch_related(
                'author_nicks__releaser', 'author_affiliation_nicks__releaser')
            screenshots = Screenshot.select_for_production_ids(production_ids)
            for prod in productions:
                prod.selected_screenshot = screenshots.get(prod.pk)
                fetched[('production', prod.pk)] = prod
        if 'releaser' in to_fetch:
            releasers = Releaser.objects.filter(
                pk__in=to_fetch['releaser']).prefetch_related(
                'group_memberships__group__nicks', 'nicks')
            for releaser in releasers:
                fetched[('releaser', releaser.pk)] = releaser
        if 'party' in to_fetch:
            parties = Party.objects.filter(pk__in=to_fetch['party'])
            for party in parties:
                fetched[('party', party.pk)] = party
        if 'bbs' in to_fetch:
            bbses = BBS.objects.filter(pk__in=to_fetch['bbs'])
            for bbs in bbses:
                fetched[('bbs', bbs.pk)] = bbs
        # Build final list in same order as returned by the original results query
        results = []
        for d in search_result_data:
            item = fetched.get((d['type'], d['pk'])) or None
            if item:
                if d['type'] == 'production':
                    if item.selected_screenshot:
                        screenshot = item.selected_screenshot
                        width, height = screenshot.thumb_dimensions_to_fit(
                            48, 36)
                        thumbnail = {
                            'url': screenshot.thumbnail_url,
                            'width': width,
                            'height': height,
                            'natural_width': screenshot.thumbnail_width,
                            'natural_height': screenshot.thumbnail_height,
                        }
                    else:
                        thumbnail = None
                    results.append({
                        'type': item.supertype,
                        'url': item.get_absolute_url(),
                        'value': item.title_with_byline,
                        'thumbnail': thumbnail
                    })
                elif d['type'] == 'releaser':
                    primary_nick = item.primary_nick
                    if primary_nick.differentiator:
                        differentiator = " (%s)" % primary_nick.differentiator
                    else:
                        differentiator = ""
                    results.append({
                        'type': 'group' if item.is_group else 'scener',
                        'url': item.get_absolute_url(),
                        'value': item.name_with_affiliations() + differentiator,
                    })
                elif d['type'] == 'party':
                    results.append({
                        'type': 'party',
                        'url': item.get_absolute_url(),
                        'value': item.name,
                    })
                elif d['type'] == 'bbs':
                    results.append({
                        'type': 'bbs',
                        'url': item.get_absolute_url(),
                        'value': item.name,
                    })
    else:
        # no query supplied: return an empty result set
        results = []
    return JsonResponse(results, safe=False)
def save(self, *args, **kwargs):
    """Save the nick variant, refreshing search_title whenever a name is set."""
    current_name = self.name
    if current_name:
        # keep the denormalised search_title in step with the name
        self.search_title = generate_search_title(current_name)
    return super(NickVariant, self).save(*args, **kwargs)
def search(self, page_number=1, count=50):
    """
    Run the search described by this form's cleaned data.

    Parses filter expressions (platform:, by:, of:, group:, year:, type:,
    [tags], ...) out of the query string, builds a union of per-model
    subqueries (productions, releasers, parties, BBSes) ranked by postgres
    full-text rank and title exactness, then fetches the requested page of
    real model instances.

    Returns a (results, page) tuple: results is a list of model instances
    (each annotated with .search_info) in ranked order, page is the
    Paginator page object.
    """
    query = self.cleaned_data['q']
    # Look for filter expressions within query
    filter_expressions = collections.defaultdict(set)
    tag_names = set()

    def apply_filter(match):
        key, val = match.groups()
        if key in RECOGNISED_FILTER_KEYS:
            filter_expressions[key].add(val)
            return ''
        else:
            # the filter has not been recognised;
            # leave the original string intact to be handled as a search term
            return match.group(0)

    for filter_re in (FILTER_RE_ONEWORD, FILTER_RE_SINGLEQUOTE, FILTER_RE_DOUBLEQUOTE):
        query = filter_re.sub(apply_filter, query)

    def apply_tag(match):
        tag_names.add(match.group(1))
        return ''

    query = TAG_RE.sub(apply_tag, query)
    # after stripping filters/tags, is there any free-text search term left?
    asciified_query = unidecode(query).strip()
    has_search_term = bool(asciified_query)
    if has_search_term:
        psql_query = SearchQuery(unidecode(query))
        clean_query = generate_search_title(query)
        production_filter_q = Q(search_document=psql_query)
        releaser_filter_q = Q(search_document=psql_query)
        party_filter_q = Q(search_document=psql_query)
        bbs_filter_q = Q(search_document=psql_query)
    else:
        # filters only; start from match-everything Q objects
        production_filter_q = Q()
        releaser_filter_q = Q()
        party_filter_q = Q()
        bbs_filter_q = Q()

    # each filter type narrows the set of models worth querying at all
    subqueries_to_perform = set(['production', 'releaser', 'party', 'bbs'])
    if 'platform' in filter_expressions or 'on' in filter_expressions:
        subqueries_to_perform &= set(['production'])
        platforms = filter_expressions['platform'] | filter_expressions[
            'on']
        platform_ids = Platform.objects.none().values_list('id', flat=True)
        for platform_name in platforms:
            platform_ids |= (Platform.objects.filter(
                Q(name__iexact=platform_name) |
                Q(aliases__name__iexact=platform_name)).values_list(
                'id', flat=True))
        production_filter_q &= Q(platforms__id__in=list(platform_ids))
    if 'screenshot' in filter_expressions or 'screenshots' in filter_expressions:
        subqueries_to_perform &= set(['production'])
        for flag in filter_expressions['screenshot'] | filter_expressions[
                'screenshots']:
            if flag in ('yes', 'true'):
                production_filter_q &= Q(has_screenshot=True)
            elif flag in ('no', 'false'):
                production_filter_q &= Q(has_screenshot=False)
    if 'by' in filter_expressions or 'author' in filter_expressions:
        subqueries_to_perform &= set(['production'])
        for name in filter_expressions['by'] | filter_expressions['author']:
            clean_name = generate_search_title(name)
            production_filter_q &= (
                # join back through releaser so that we match any nick variant ever used by the author,
                # not just the nick used on the prod. Better to err on the side of being too liberal
                Q(author_nicks__releaser__nicks__variants__search_title=clean_name) |
                Q(author_affiliation_nicks__releaser__nicks__variants__search_title=clean_name))
    if 'of' in filter_expressions:
        subqueries_to_perform &= set(['releaser'])
        for name in filter_expressions['of']:
            clean_name = generate_search_title(name)
            releaser_filter_q &= Q(
                is_group=False,
                group_memberships__group__nicks__variants__search_title=clean_name)
    if 'group' in filter_expressions:
        subqueries_to_perform &= set(['production', 'releaser'])
        for name in filter_expressions['group']:
            clean_name = generate_search_title(name)
            releaser_filter_q &= Q(
                is_group=False,
                group_memberships__group__nicks__variants__search_title=clean_name)
            production_filter_q &= (
                # join back through releaser so that we match any nick variant ever used by the author,
                # not just the nick used on the prod. Better to err on the side of being too liberal
                Q(author_nicks__releaser__is_group=True,
                  author_nicks__releaser__nicks__variants__search_title=clean_name) |
                Q(author_affiliation_nicks__releaser__nicks__variants__search_title=clean_name))
    if tag_names or ('tagged' in filter_expressions):
        subqueries_to_perform &= set(['production'])
        for tag_name in filter_expressions['tagged'] | tag_names:
            production_filter_q &= Q(tags__name=tag_name)
    if 'year' in filter_expressions or 'date' in filter_expressions:
        subqueries_to_perform &= set(['production', 'party'])
        for date_str in filter_expressions['year'] | filter_expressions[
                'date']:
            try:
                date_expr = FuzzyDate.parse(date_str)
            except ValueError:
                # unparseable date filters are silently ignored
                continue
            production_filter_q &= Q(
                release_date_date__gte=date_expr.date_range_start(),
                release_date_date__lte=date_expr.date_range_end())
            party_filter_q &= Q(
                end_date_date__gte=date_expr.date_range_start(),
                start_date_date__lte=date_expr.date_range_end())
    if 'before' in filter_expressions:
        subqueries_to_perform &= set(['production', 'party'])
        for date_str in filter_expressions['before']:
            try:
                date_expr = FuzzyDate.parse(date_str)
            except ValueError:
                continue
            production_filter_q &= Q(
                release_date_date__lt=date_expr.date_range_start())
            party_filter_q &= Q(
                start_date_date__lt=date_expr.date_range_start())
    if 'until' in filter_expressions:
        subqueries_to_perform &= set(['production', 'party'])
        for date_str in filter_expressions['until']:
            try:
                date_expr = FuzzyDate.parse(date_str)
            except ValueError:
                continue
            production_filter_q &= Q(
                release_date_date__lte=date_expr.date_range_end())
            party_filter_q &= Q(
                start_date_date__lte=date_expr.date_range_end())
    if 'after' in filter_expressions:
        subqueries_to_perform &= set(['production', 'party'])
        for date_str in filter_expressions['after']:
            try:
                date_expr = FuzzyDate.parse(date_str)
            except ValueError:
                continue
            production_filter_q &= Q(
                release_date_date__gt=date_expr.date_range_end())
            party_filter_q &= Q(
                end_date_date__gt=date_expr.date_range_end())
    if 'since' in filter_expressions:
        subqueries_to_perform &= set(['production', 'party'])
        for date_str in filter_expressions['since']:
            try:
                date_expr = FuzzyDate.parse(date_str)
            except ValueError:
                continue
            production_filter_q &= Q(
                release_date_date__gte=date_expr.date_range_start())
            party_filter_q &= Q(
                end_date_date__gte=date_expr.date_range_start())
    if 'type' in filter_expressions:
        requested_types = filter_expressions['type']
        subqueries_from_type = set()
        filter_by_prod_supertype = False
        production_supertypes = []
        for supertype in ('production', 'graphics', 'music'):
            if supertype in requested_types:
                filter_by_prod_supertype = True
                production_supertypes.append(supertype)
        if filter_by_prod_supertype:
            subqueries_from_type.add('production')
            production_filter_q &= Q(supertype__in=production_supertypes)
        if 'releaser' in requested_types or 'scener' in requested_types or 'group' in requested_types:
            subqueries_from_type.add('releaser')
            # only narrow by is_group when exactly one of scener/group was asked for
            if 'scener' in requested_types and not (
                    'releaser' in requested_types or 'group' in requested_types):
                releaser_filter_q &= Q(is_group=False)
            if 'group' in requested_types and not (
                    'releaser' in requested_types or 'scener' in requested_types):
                releaser_filter_q &= Q(is_group=True)
        if 'party' in requested_types:
            subqueries_from_type.add('party')
        if 'bbs' in requested_types:
            subqueries_from_type.add('bbs')
        # assume that any otherwise-unrecognised 'type' values indicate a production type
        production_types = set()
        for val in requested_types:
            if val not in ('production', 'graphics', 'music', 'scener',
                           'group', 'releaser', 'party', 'bbs'):
                production_types.add(val)
        if production_types:
            prod_type_names_q = Q()
            for name in production_types:
                prod_type_names_q |= Q(name__iexact=name)
            prod_type_ids = ProductionType.objects.filter(
                prod_type_names_q).values_list('id', flat=True)
            subqueries_from_type.add('production')
            production_filter_q &= Q(types__in=prod_type_ids)
        subqueries_to_perform &= subqueries_from_type

    # Construct the master search query as a union of subqueries that search
    # one model each. Each subquery yields a queryset of dicts with the following fields:
    # 'type': 'production', 'releaser' or 'party'
    # 'pk': primary key of the relevant object
    # 'exactness': magic number used to prioritise exact/prefix title matches in the ordering:
    #     2 = (the cleaned version of) the title exactly matches (the cleaned verson of) the search query
    #     1 = (the cleaned version of) the title starts with (the cleaned version of) the search query
    #     0 = neither of the above
    # 'rank': search ranking as calculated by postgres search

    # start with an empty queryset
    if has_search_term:
        rank_annotation = SearchRank(F('search_document'), psql_query)
    else:
        rank_annotation = models.Value('', output_field=models.CharField())
    qs = Production.objects.annotate(
        type=models.Value('empty', output_field=models.CharField()),
        exactness=models.Value(0, output_field=models.IntegerField()),
        rank=rank_annotation).values('pk', 'type', 'exactness', 'rank').none()
    if 'production' in subqueries_to_perform:
        # Search for productions
        if has_search_term:
            rank_annotation = SearchRank(F('search_document'), psql_query)
            exactness_annotation = models.Case(
                models.When(search_title=clean_query, then=models.Value(2)),
                models.When(search_title__startswith=clean_query,
                            then=models.Value(1)),
                default=models.Value(0, output_field=models.IntegerField()),
                output_field=models.IntegerField())
        else:
            # no text query: fall back to alphabetical ordering via rank
            rank_annotation = F('sortable_title')
            exactness_annotation = models.Value(
                0, output_field=models.IntegerField())
        qs = qs.union(
            Production.objects.annotate(
                rank=rank_annotation,
                type=models.Value('production',
                                  output_field=models.CharField()),
                exactness=exactness_annotation).filter(production_filter_q).order_by(
                # empty order_by to cancel the Production model's native ordering
            ).distinct().values('pk', 'type', 'exactness', 'rank'))
    if 'releaser' in subqueries_to_perform:
        # Search for releasers
        if has_search_term:
            rank_annotation = SearchRank(F('search_document'), psql_query)
            # Exactness test will be applied to each of the releaser's nick variants;
            # take the highest result
            exactness_annotation = models.Max(
                models.Case(
                    models.When(nicks__variants__search_title=clean_query,
                                then=models.Value(2)),
                    models.When(nicks__variants__search_title__startswith=clean_query,
                                then=models.Value(1)),
                    default=models.Value(
                        0, output_field=models.IntegerField()),
                    output_field=models.IntegerField()))
        else:
            rank_annotation = F('name')
            exactness_annotation = models.Value(
                0, output_field=models.IntegerField())
        qs = qs.union(
            Releaser.objects.annotate(
                rank=rank_annotation,
                type=models.Value('releaser',
                                  output_field=models.CharField()),
                exactness=exactness_annotation).filter(releaser_filter_q).distinct().order_by(
                # empty order_by to cancel the Releaser model's native ordering
            ).values('pk', 'type', 'exactness', 'rank'))
    if 'party' in subqueries_to_perform:
        # Search for parties
        if has_search_term:
            rank_annotation = SearchRank(F('search_document'), psql_query)
            exactness_annotation = models.Case(
                models.When(search_title=clean_query, then=models.Value(2)),
                models.When(search_title__startswith=clean_query,
                            then=models.Value(1)),
                default=models.Value(0, output_field=models.IntegerField()),
                output_field=models.IntegerField())
        else:
            rank_annotation = F('name')
            exactness_annotation = models.Value(
                0, output_field=models.IntegerField())
        qs = qs.union(
            Party.objects.annotate(
                rank=rank_annotation,
                type=models.Value('party', output_field=models.CharField()),
                exactness=exactness_annotation,
            ).filter(party_filter_q).order_by(
                # empty order_by to cancel the Party model's native ordering
            ).values('pk', 'type', 'exactness', 'rank'),
        )
    if 'bbs' in subqueries_to_perform:
        # Search for BBSes
        if has_search_term:
            rank_annotation = SearchRank(F('search_document'), psql_query)
            exactness_annotation = models.Case(
                models.When(search_title=clean_query, then=models.Value(2)),
                models.When(search_title__startswith=clean_query,
                            then=models.Value(1)),
                default=models.Value(0, output_field=models.IntegerField()),
                output_field=models.IntegerField())
        else:
            rank_annotation = F('name')
            exactness_annotation = models.Value(
                0, output_field=models.IntegerField())
        qs = qs.union(
            BBS.objects.annotate(
                rank=rank_annotation,
                type=models.Value('bbs', output_field=models.CharField()),
                exactness=exactness_annotation,
            ).filter(bbs_filter_q).order_by(
                # empty order_by to cancel any model-level native ordering
            ).values('pk', 'type', 'exactness', 'rank'),
        )
    if has_search_term:
        qs = qs.order_by('-exactness', '-rank', 'pk')
    else:
        # rank holds a name/title here, so sort it ascending
        qs = qs.order_by('-exactness', 'rank', 'pk')

    # Apply pagination to the query before performing the (expensive) real data fetches.
    paginator = Paginator(qs, count)
    # If page request (9999) is out of range, deliver last page of results.
    try:
        page = paginator.page(page_number)
    except (EmptyPage, InvalidPage):
        page = paginator.page(paginator.num_pages)

    # Assemble the results into a plan for fetching the actual models -
    # form a dict that maps model/type to a set of PKs
    to_fetch = {}
    for d in page.object_list:
        to_fetch.setdefault(d['type'], set()).add(d['pk'])

    # now do the fetches, and store the results as a mapping of (type, pk) tuple to object
    fetched = {}
    if 'production' in to_fetch:
        production_ids = to_fetch['production']
        productions = Production.objects.filter(
            pk__in=production_ids).prefetch_related(
            'author_nicks__releaser', 'author_affiliation_nicks__releaser')
        if has_search_term:
            productions = productions.annotate(
                search_snippet=TSHeadline('notes', psql_query))
        screenshots = Screenshot.select_for_production_ids(production_ids)
        for prod in productions:
            prod.selected_screenshot = screenshots.get(prod.pk)
            # Ignore any search snippets that don't actually contain a highlighted term
            prod.has_search_snippet = has_search_term and '<b>' in prod.search_snippet
            fetched[('production', prod.pk)] = prod
    if 'releaser' in to_fetch:
        releasers = Releaser.objects.filter(
            pk__in=to_fetch['releaser']).prefetch_related(
            'group_memberships__group__nicks', 'nicks')
        if has_search_term:
            releasers = releasers.annotate(
                search_snippet=TSHeadline('notes', psql_query))
        for releaser in releasers:
            releaser.has_search_snippet = has_search_term and '<b>' in releaser.search_snippet
            fetched[('releaser', releaser.pk)] = releaser
    if 'party' in to_fetch:
        parties = Party.objects.filter(pk__in=to_fetch['party'])
        if has_search_term:
            parties = parties.annotate(
                search_snippet=TSHeadline('notes', psql_query))
        for party in parties:
            party.has_search_snippet = has_search_term and '<b>' in party.search_snippet
            fetched[('party', party.pk)] = party
    if 'bbs' in to_fetch:
        bbses = BBS.objects.filter(pk__in=to_fetch['bbs'])
        if has_search_term:
            bbses = bbses.annotate(
                search_snippet=TSHeadline('notes', psql_query))
        for bbs in bbses:
            bbs.has_search_snippet = has_search_term and '<b>' in bbs.search_snippet
            fetched[('bbs', bbs.pk)] = bbs

    # Build final list in same order as returned by the original results query
    results = []
    for d in page.object_list:
        item = fetched.get((d['type'], d['pk'])) or None
        if item:
            item.search_info = d
            results.append(item)

    return (results, page)
def populate_search_title(apps, schema_editor):
    """Data migration: backfill search_title for every existing BBS row."""
    bbs_model = apps.get_model("bbs", "BBS")
    for record in bbs_model.objects.all():
        record.search_title = generate_search_title(record.name)
        # write only the changed column
        record.save(update_fields=['search_title'])
def save(self, *args, **kwargs):
    """Save the record, regenerating search_title from name first."""
    # keep the denormalised search column in step with the name
    self.search_title = generate_search_title(self.name)
    return super().save(*args, **kwargs)
def save(self, *args, **kwargs):
    """Save the record; search_title is re-derived from name when one is set."""
    current_name = self.name
    if current_name:
        # refresh the denormalised search column before hitting the database
        self.search_title = generate_search_title(current_name)
    return super().save(*args, **kwargs)
def handle(self, *args, **kwargs):
    """
    Cross-link Janeway authors to Demozoo releasers.

    For each Janeway author, find Demozoo releasers sharing at least one
    cleaned name (and the same scener/group type), then confirm the match via
    memberships already linked through KestraBitworldAuthor IDs. Confirmed
    matches get a ReleaserExternalLink; ambiguous matches (more than one
    candidate) are additionally reported to janeway_dupes.csv for manual
    review.
    """
    creation_count = 0
    # open the dupes report with a context manager so the file is closed even
    # if the matching loop raises; newline='' is the csv-module recommended mode
    with open('janeway_dupes.csv', mode='w', newline='') as dupes_file:
        dupes_csv = csv.writer(dupes_file, delimiter=',', quotechar='"', quoting=csv.QUOTE_MINIMAL)
        dupes_csv.writerow(['Scener', 'Janeway URL', 'Demozoo URLs'])
        for (index, author) in enumerate(Author.objects.all()):
            if (index % 100 == 0):
                print("processed %d authors" % index)
            # find releasers in the Demozoo data that match any of this author's names
            # (excluding abbreviations) and are of the right type (scener vs group)
            candidate_releaser_ids = list(Releaser.objects.filter(
                is_group=author.is_group,
                nicks__variants__search_title__in=[generate_search_title(name.name) for name in author.names.all()]
            ).distinct().values_list('id', flat=True))
            if not candidate_releaser_ids:
                continue
            if author.is_group:
                # get the janeway IDs of this group's members
                member_ids = Author.objects.filter(group_memberships__group=author).values_list('janeway_id', flat=True)
                member_demozoo_ids = list(ReleaserExternalLink.objects.filter(
                    link_class='KestraBitworldAuthor',
                    parameter__in=[str(id) for id in member_ids]
                ).values_list('releaser_id', flat=True))
                # see if any candidate releasers have one or more matching members by ID.
                # (matching members by *name* was tried and removed due to false positives,
                # e.g. JP/Mayhem - janeway author 29340 vs demozoo scener 40395)
                matching_releaser_ids = set(DZMembership.objects.filter(
                    group_id__in=candidate_releaser_ids,
                    member_id__in=member_demozoo_ids
                ).distinct().values_list('group_id', flat=True))
            else:
                # get the janeway IDs of the groups this scener belongs to
                group_ids = Author.objects.filter(
                    is_group=True, member_memberships__member=author
                ).values_list('janeway_id', flat=True)
                group_demozoo_ids = list(ReleaserExternalLink.objects.filter(
                    link_class='KestraBitworldAuthor',
                    parameter__in=[str(id) for id in group_ids]
                ).values_list('releaser_id', flat=True))
                # see if any candidate releasers have one or more matching groups by ID
                # (name-based matching removed for the same false-positive reason as above)
                matching_releaser_ids = set(DZMembership.objects.filter(
                    member_id__in=candidate_releaser_ids,
                    group_id__in=group_demozoo_ids
                ).distinct().values_list('member_id', flat=True))
            if len(matching_releaser_ids) > 1:
                # ambiguous: log every candidate for manual review.
                # NOTE: the original wrote author.name.encode('utf-8'), which under
                # Python 3 emits the bytes repr (b'...') into the CSV; write the str
                dupes_csv.writerow(
                    [author.name, 'http://janeway.exotica.org.uk/author.php?id=%d' % author.janeway_id] + [
                        'https://demozoo.org/%s/%d/' % ('groups' if author.is_group else 'sceners', id)
                        for id in list(matching_releaser_ids)
                    ]
                )
            # don't re-create links that already exist for this janeway author
            already_linked_releasers = list(ReleaserExternalLink.objects.filter(
                link_class='KestraBitworldAuthor', parameter=author.janeway_id
            ).values_list('releaser_id', flat=True))
            for releaser_id in matching_releaser_ids:
                if releaser_id not in already_linked_releasers:
                    ReleaserExternalLink.objects.create(
                        releaser_id=releaser_id,
                        link_class='KestraBitworldAuthor',
                        parameter=author.janeway_id,
                        source='janeway-automatch',
                    )
                    creation_count += 1
    print("%d cross-links created" % creation_count)
def search(self, with_real_names: bool = False, page_number: int = 1, count: int = 50):
    """Run a full-text search across productions, releasers and parties.

    Parses filter expressions (e.g. ``platform:``, ``by:``, ``year:``, ``type:``,
    ``[tag]`` syntax) out of the raw query string, builds a per-model filter for
    each of the three searchable models, unions the matching subqueries into one
    ranked result set, paginates it, then bulk-fetches the real model objects.

    :param with_real_names: if True, search releasers against their
        admin_search_document (which includes real names) instead of the
        public search_document
    :param page_number: 1-based page of results to return
    :param count: results per page
    :returns: a ``(results, page)`` tuple — ``results`` is a list of
        Production / Releaser / Party instances (each given ``search_info``
        and ``has_search_snippet`` attributes), ``page`` is the Paginator page
    """
    query = self.cleaned_data['q']

    # Look for filter expressions within query
    filter_expressions = collections.defaultdict(set)
    tag_names = set()

    def apply_filter(match):
        # Pull a key:value pair out of the query string; recognised keys are
        # collected into filter_expressions and removed from the free-text query.
        key, val = match.groups()
        if key in RECOGNISED_FILTER_KEYS:
            filter_expressions[key].add(val)
            return ''
        else:
            # the filter has not been recognised;
            # leave the original string intact to be handled as a search term
            return match.group(0)

    # Three regexes cover the bare-word, single-quoted and double-quoted forms.
    for filter_re in (FILTER_RE_ONEWORD, FILTER_RE_SINGLEQUOTE, FILTER_RE_DOUBLEQUOTE):
        query = filter_re.sub(apply_filter, query)

    def apply_tag(match):
        # Collect [tag]-style tokens and strip them from the free-text query.
        tag_names.add(match.group(1))
        return ''

    query = TAG_RE.sub(apply_tag, query)

    # unidecode folds accented characters so the postgres query matches
    # the ASCII-folded search documents.
    psql_query = SearchQuery(unidecode(query))
    clean_query = generate_search_title(query)
    rank_annotation = SearchRank(F('search_document'), psql_query)

    # Which of the three model subqueries to run; filters below narrow this
    # (e.g. a platform: filter can only ever match productions).
    # NOTE(review): set([...]) could be a set literal — left as-is.
    subqueries_to_perform = set(['production', 'releaser', 'party'])

    production_filter_q = Q(search_document=psql_query)

    if with_real_names:
        # admin search document additionally indexes sceners' real names
        releaser_filter_q = Q(admin_search_document=psql_query)
        releaser_rank_annotation = SearchRank(F('admin_search_document'), psql_query)
    else:
        releaser_filter_q = Q(search_document=psql_query)
        releaser_rank_annotation = rank_annotation

    party_filter_q = Q(search_document=psql_query)

    if 'platform' in filter_expressions or 'on' in filter_expressions:
        # 'on' is an alias for 'platform'; only productions have platforms
        subqueries_to_perform &= set(['production'])
        platforms = filter_expressions['platform'] | filter_expressions['on']

        # Accumulate platform IDs matching any requested name or alias
        platform_ids = Platform.objects.none().values_list('id', flat=True)
        for platform_name in platforms:
            platform_ids |= Platform.objects.filter(
                Q(name__iexact=platform_name) | Q(aliases__name__iexact=platform_name)
            ).values_list('id', flat=True)

        production_filter_q &= Q(platforms__id__in=list(platform_ids))

    if 'by' in filter_expressions or 'author' in filter_expressions:
        subqueries_to_perform &= set(['production'])
        for name in filter_expressions['by'] | filter_expressions['author']:
            clean_name = generate_search_title(name)
            production_filter_q &= (
                # join back through releaser so that we match any nick variant ever used by the author,
                # not just the nick used on the prod. Better to err on the side of being too liberal
                Q(author_nicks__releaser__nicks__variants__search_title=clean_name) |
                Q(author_affiliation_nicks__releaser__nicks__variants__search_title=clean_name)
            )

    if 'of' in filter_expressions:
        # 'of:' restricts to sceners who are members of the named group
        subqueries_to_perform &= set(['releaser'])
        for name in filter_expressions['of']:
            clean_name = generate_search_title(name)
            releaser_filter_q &= Q(
                is_group=False,
                group_memberships__group__nicks__variants__search_title=clean_name
            )

    if 'group' in filter_expressions:
        # 'group:' matches both members of the group (releaser subquery) and
        # prods authored by / affiliated with the group (production subquery)
        subqueries_to_perform &= set(['production', 'releaser'])
        for name in filter_expressions['group']:
            clean_name = generate_search_title(name)
            releaser_filter_q &= Q(
                is_group=False,
                group_memberships__group__nicks__variants__search_title=clean_name
            )
            production_filter_q &= (
                # join back through releaser so that we match any nick variant ever used by the author,
                # not just the nick used on the prod. Better to err on the side of being too liberal
                Q(
                    author_nicks__releaser__is_group=True,
                    author_nicks__releaser__nicks__variants__search_title=clean_name
                ) | Q(
                    author_affiliation_nicks__releaser__nicks__variants__search_title=clean_name
                )
            )

    if tag_names or ('tagged' in filter_expressions):
        # 'tagged:' filter and bare [tag] tokens are treated identically
        subqueries_to_perform &= set(['production'])
        for tag_name in filter_expressions['tagged'] | tag_names:
            production_filter_q &= Q(tags__name=tag_name)

    if 'year' in filter_expressions:
        subqueries_to_perform &= set(['production', 'party'])
        for year_str in filter_expressions['year']:
            try:
                year = int(year_str)
            except ValueError:
                # silently ignore non-numeric year filters
                continue
            production_filter_q &= Q(release_date_date__year=year)
            # a party matches if it either starts or ends in the given year
            party_filter_q &= (Q(start_date_date__year=year) | Q(end_date_date__year=year))

    if 'type' in filter_expressions:
        requested_types = filter_expressions['type']
        subqueries_from_type = set()
        filter_by_prod_supertype = False
        production_supertypes = []

        # 'production' / 'graphics' / 'music' select production supertypes
        for supertype in ('production', 'graphics', 'music'):
            if supertype in requested_types:
                filter_by_prod_supertype = True
                production_supertypes.append(supertype)

        if filter_by_prod_supertype:
            subqueries_from_type.add('production')
            production_filter_q &= Q(supertype__in=production_supertypes)

        if 'releaser' in requested_types or 'scener' in requested_types or 'group' in requested_types:
            subqueries_from_type.add('releaser')

            # narrow to sceners-only / groups-only when exactly one was asked for
            if 'scener' in requested_types and not ('releaser' in requested_types or 'group' in requested_types):
                releaser_filter_q &= Q(is_group=False)

            if 'group' in requested_types and not ('releaser' in requested_types or 'scener' in requested_types):
                releaser_filter_q &= Q(is_group=True)

        if 'party' in requested_types:
            subqueries_from_type.add('party')

        # assume that any otherwise-unrecognised 'type' values indicate a production type
        production_types = set()
        for val in requested_types:
            if val not in ('production', 'graphics', 'music', 'scener', 'group', 'releaser', 'party'):
                production_types.add(val)

        if production_types:
            subqueries_from_type.add('production')
            production_filter_q &= Q(types__name__in=production_types)

        subqueries_to_perform &= subqueries_from_type

    # Construct the master search query as a union of subqueries that search
    # one model each. Each subquery yields a queryset of dicts with the following fields:
    # 'type': 'production', 'releaser' or 'party'
    # 'pk': primary key of the relevant object
    # 'exactness': magic number used to prioritise exact/prefix title matches in the ordering:
    #     2 = (the cleaned version of) the title exactly matches (the cleaned verson of) the search query
    #     1 = (the cleaned version of) the title starts with (the cleaned version of) the search query
    #     0 = neither of the above
    # 'rank': search ranking as calculated by postgres search

    # start with an empty queryset
    qs = Production.objects.annotate(
        type=models.Value('empty', output_field=models.CharField()),
        exactness=models.Value(0, output_field=models.IntegerField()),
        rank=rank_annotation
    ).values('pk', 'type', 'exactness', 'rank').none()

    if 'production' in subqueries_to_perform:
        # Search for productions
        qs = qs.union(
            Production.objects.annotate(
                rank=rank_annotation,
                type=models.Value('production', output_field=models.CharField()),
                exactness=models.Case(
                    models.When(search_title=clean_query, then=models.Value(2)),
                    models.When(search_title__startswith=clean_query, then=models.Value(1)),
                    default=models.Value(0, output_field=models.IntegerField()),
                    output_field=models.IntegerField()
                )
            ).filter(
                production_filter_q
            ).order_by(
                # empty order_by to cancel the Production model's native ordering
            ).distinct().values('pk', 'type', 'exactness', 'rank')
        )

    if 'releaser' in subqueries_to_perform:
        # Search for releasers
        qs = qs.union(
            Releaser.objects.annotate(
                rank=releaser_rank_annotation,
                type=models.Value('releaser', output_field=models.CharField()),
                # Exactness test will be applied to each of the releaser's nick variants;
                # take the highest result
                exactness=models.Max(models.Case(
                    models.When(nicks__variants__search_title=clean_query, then=models.Value(2)),
                    models.When(nicks__variants__search_title__startswith=clean_query, then=models.Value(1)),
                    default=models.Value(0, output_field=models.IntegerField()),
                    output_field=models.IntegerField()
                ))
            ).filter(
                releaser_filter_q
            ).order_by(
                # empty order_by to cancel the Releaser model's native ordering
            ).values('pk', 'type', 'exactness', 'rank')
        )

    if 'party' in subqueries_to_perform:
        # Search for parties
        qs = qs.union(
            Party.objects.annotate(
                rank=rank_annotation,
                type=models.Value('party', output_field=models.CharField()),
                exactness=models.Case(
                    models.When(search_title=clean_query, then=models.Value(2)),
                    models.When(search_title__startswith=clean_query, then=models.Value(1)),
                    default=models.Value(0, output_field=models.IntegerField()),
                    output_field=models.IntegerField()
                )
            ).filter(
                party_filter_q
            ).order_by(
                # empty order_by to cancel the Party model's native ordering
            ).values('pk', 'type', 'exactness', 'rank'),
        )

    # exact/prefix title matches first, then postgres rank, with pk as tiebreaker
    qs = qs.order_by('-exactness', '-rank', 'pk')

    # Apply pagination to the query before performing the (expensive) real data fetches.
    paginator = Paginator(qs, count)

    # If page request (9999) is out of range, deliver last page of results.
    try:
        page = paginator.page(page_number)
    except (EmptyPage, InvalidPage):
        page = paginator.page(paginator.num_pages)

    # Assemble the results into a plan for fetching the actual models -
    # form a dict that maps model/type to a set of PKs
    to_fetch = {}
    for d in page.object_list:
        to_fetch.setdefault(d['type'], set()).add(d['pk'])

    # now do the fetches, and store the results as a mapping of (type, pk) tuple to object
    fetched = {}

    if 'production' in to_fetch:
        production_ids = to_fetch['production']
        productions = Production.objects.filter(pk__in=production_ids).prefetch_related(
            'author_nicks__releaser', 'author_affiliation_nicks__releaser'
        ).annotate(
            search_snippet=TSHeadline('notes', psql_query)
        )
        screenshots = Screenshot.select_for_production_ids(production_ids)
        for prod in productions:
            prod.selected_screenshot = screenshots.get(prod.pk)
            # Ignore any search snippets that don't actually contain a highlighted term
            prod.has_search_snippet = '<b>' in prod.search_snippet
            fetched[('production', prod.pk)] = prod

    if 'releaser' in to_fetch:
        releasers = Releaser.objects.filter(pk__in=to_fetch['releaser']).prefetch_related(
            'group_memberships__group__nicks', 'nicks'
        ).annotate(
            search_snippet=TSHeadline('notes', psql_query)
        )
        for releaser in releasers:
            releaser.has_search_snippet = '<b>' in releaser.search_snippet
            fetched[('releaser', releaser.pk)] = releaser

    if 'party' in to_fetch:
        parties = Party.objects.filter(pk__in=to_fetch['party']).annotate(
            search_snippet=TSHeadline('notes', psql_query)
        )
        for party in parties:
            party.has_search_snippet = '<b>' in party.search_snippet
            fetched[('party', party.pk)] = party

    # Build final list in same order as returned by the original results query
    results = []
    for d in page.object_list:
        # NOTE(review): `or None` is redundant (get already returns None) — left as-is
        item = fetched.get((d['type'], d['pk'])) or None
        if item:
            item.search_info = d
            results.append(item)

    return (results, page)