def _hash_dict(d):
    """
    Return the hex digest of a dict, processing its entries in sorted key order.

    For every key, the key and then its value are passed through
    _encode_field and _escape_bytes and fed into the hash, followed by a
    single b"," terminator. No separator is written between a key and its
    value.

    @param d The dict to hash
    @return The hexadecimal digest produced by hash_class
    """
    digest = hash_class()
    for field_name in sorted(d):
        # Key first, then value, exactly in that order.
        for part in (field_name, d[field_name]):
            digest.update(_escape_bytes(_encode_field(part)))
        digest.update(b",")
    return digest.hexdigest()
def hash_articles(cls, articleset: ArticleSet, ignore_fields: set) -> Iterable[Tuple[int, str]]:
    """
    Finds all articles in an articleset, and hashes articles as a tuple of field values,
    ordered alphabetically by field name. Fields in ignore_fields will not affect the hash.
    Hashes for two articles are equal, if and only if for each field that is not in
    ignore_fields, the values of those fields are equal in both articles.

    When ignore_fields is empty (or None), no hash is computed here: the value of the
    "hash" field stored on each document is yielded unchanged.

    @param articleset       The articleset that is to be searched
    @param ignore_fields    A set of fields that should not be included in the calculated hashes
    @return                 An iterable of (<article_id>, <hash>) tuples.
    """
    # Select every document whose "sets" field contains this articleset's id.
    query = {"query": {"constant_score": {"filter": {"term": {"sets": articleset.id}}}}}

    if not ignore_fields:
        for hit in amcates.ES().scan(query=query, _source=["hash"]):
            yield int(hit['_id']), hit['_source']['hash']
        return

    # The field list is only needed when a hash has to be computed, so it is
    # built here rather than unconditionally.
    all_fields = STATIC_FIELDS + list(articleset.get_used_properties())
    fields = sorted(f for f in all_fields if f not in ignore_fields)

    for hit in amcates.ES().scan(query=query, _source=fields):
        source = hit['_source']
        # NOTE(review): a field missing from the document is rendered as the
        # string "[None]" (the str() of the default), which differs from a
        # field explicitly set to None. Kept as-is so hash values do not change.
        art_tuple = tuple(str(source.get(k, [None])) for k in fields)
        digest = hash_class(repr(art_tuple).encode()).hexdigest()
        yield int(hit['_id']), digest
def get_hashes(self):
    """
    Yield an (<article_id>, <hash>) tuple for every article in the article set
    given by self.options['articleset'].

    A field f from FIELDS is left out of the hash when the option "skip_<f>"
    is truthy. If no field is skipped, the "hash" field stored on each
    document is yielded unchanged; otherwise a hash is computed from the JSON
    serialisation of the remaining fields.

    @return A generator of (<article_id>, <hash>) tuples.
    """
    fields = [f for f in FIELDS if not self.options.get("skip_{}".format(f))]
    if fields == FIELDS:
        fields = ["hash"]
    # Decided once instead of on every hit; the field list does not change below.
    use_stored_hash = fields == ["hash"]

    setid = self.options['articleset'].id
    query = {"query": {"constant_score": {"filter": {"term": {"sets": setid}}}}}

    for hit in amcates.ES().scan(query=query, fields=fields):
        returned = hit['fields']
        if use_stored_hash:
            digest = returned['hash'][0]
        else:
            # Each returned field value is indexed with [0] (presumably the
            # search backend wraps field values in lists); absent fields stay None.
            values = {}
            for f in fields:
                val = returned.get(f)
                values[f] = val if val is None else val[0]
            # Hash functions operate on bytes, so the JSON text is encoded
            # first. json.dumps emits ASCII by default, so the bytes are the
            # same as the text on interpreters that accepted it directly.
            digest = hash_class(json.dumps(values).encode("utf-8")).hexdigest()
        yield int(hit['_id']), digest
def _get_legacy_hash(article_dict):
    """
    Return the legacy hex digest of an article.

    The values of the fields named in LEGACY_HASH_FIELDS are fed into the
    hash in that order, with no separators: ints as their decimal text,
    text strings as UTF-8, None values skipped, anything else passed to the
    hash unchanged (so it must already be bytes-like).

    @param article_dict A mapping that contains every key in LEGACY_HASH_FIELDS
    @return The hexadecimal digest produced by hash_class
    """
    # The text type is `unicode` on Python 2 and `str` on Python 3; type(u"")
    # resolves to the right one on both without naming `unicode` directly.
    text_type = type(u"")

    c = hash_class()
    for k in LEGACY_HASH_FIELDS:
        v = article_dict[k]
        if isinstance(v, int):
            # Hash objects only accept bytes; the decimal form of an int is
            # pure ASCII, so the digest is the same as feeding str(v) was on
            # interpreters where str is a byte string.
            c.update(str(v).encode("ascii"))
        elif isinstance(v, text_type):
            c.update(v.encode('utf-8'))
        elif v is not None:
            c.update(v)
    return c.hexdigest()
def _get_hash(article_dict):
    """
    Return the hex digest of an article's content fields.

    All keys of article_dict except 'id', 'sets', 'hash', 'medium' and
    'projectid' are visited in sorted order and their values fed into the
    hash with no separators: ints as their decimal text, text strings as
    UTF-8, None values skipped, anything else passed to the hash unchanged
    (so it must already be bytes-like).

    @param article_dict A mapping of field name to field value
    @return The hexadecimal digest produced by hash_class
    """
    excluded = ('id', 'sets', 'hash', 'medium', 'projectid')
    # The text type is `unicode` on Python 2 and `str` on Python 3; type(u"")
    # resolves to the right one on both without naming `unicode` directly.
    text_type = type(u"")

    c = hash_class()
    for k in sorted(k for k in article_dict.keys() if k not in excluded):
        v = article_dict[k]
        if isinstance(v, int):
            # Hash objects only accept bytes; the decimal form of an int is
            # pure ASCII, so the digest is the same as feeding str(v) was on
            # interpreters where str is a byte string.
            c.update(str(v).encode("ascii"))
        elif isinstance(v, text_type):
            c.update(v.encode('utf-8'))
        elif v is not None:
            c.update(v)
    return c.hexdigest()
def _get_hash(article_dict):
    """
    Return the hex digest of an article's content fields.

    All keys of article_dict except 'id', 'sets', 'hash', 'medium' and
    'projectid' are visited in sorted order and their values fed into the
    hash with no separators: ints as their decimal text, text strings as
    UTF-8, None values skipped, anything else passed to the hash unchanged
    (so it must already be bytes-like).

    @param article_dict A mapping of field name to field value
    @return The hexadecimal digest produced by hash_class
    """
    excluded = ('id', 'sets', 'hash', 'medium', 'projectid')
    # The text type is `unicode` on Python 2 and `str` on Python 3; type(u"")
    # resolves to the right one on both without naming `unicode` directly.
    text_type = type(u"")

    c = hash_class()
    for k in sorted(k for k in article_dict.keys() if k not in excluded):
        v = article_dict[k]
        if isinstance(v, int):
            # Hash objects only accept bytes; the decimal form of an int is
            # pure ASCII, so the digest is the same as feeding str(v) was on
            # interpreters where str is a byte string.
            c.update(str(v).encode("ascii"))
        elif isinstance(v, text_type):
            c.update(v.encode('utf-8'))
        elif v is not None:
            c.update(v)
    return c.hexdigest()
def hash_articles(cls, articleset: ArticleSet, ignore_fields: set) -> Iterable[Tuple[int, str]]:
    """
    Finds all articles in an articleset, and hashes articles as a tuple of field values,
    ordered alphabetically by field name. Fields in ignore_fields will not affect the hash.
    Hashes for two articles are equal, if and only if for each field that is not in
    ignore_fields, the values of those fields are equal in both articles.

    When ignore_fields is empty (or None), no hash is computed here: the first value of the
    "hash" field stored on each document is yielded unchanged.

    @param articleset       The articleset that is to be searched
    @param ignore_fields    A set of fields that should not be included in the calculated hashes
    @return                 An iterable of (<article_id>, <hash>) tuples.
    """
    # Select every document whose "sets" field contains this articleset's id.
    query = {"query": {"constant_score": {"filter": {"term": {"sets": articleset.id}}}}}

    if not ignore_fields:
        for hit in amcates.ES().scan(query=query, fields=["hash"]):
            yield int(hit['_id']), hit['fields']['hash'][0]
        return

    # The field list is only needed when a hash has to be computed, so it is
    # built here rather than unconditionally.
    all_fields = STATIC_FIELDS + list(articleset.get_used_properties())
    fields = sorted(f for f in all_fields if f not in ignore_fields)

    for hit in amcates.ES().scan(query=query, fields=fields):
        returned = hit['fields']
        # Only the first element of each returned field value is hashed; a
        # field missing from the hit contributes the string "None".
        art_tuple = tuple(str(returned.get(k, [None])[0]) for k in fields)
        digest = hash_class(repr(art_tuple).encode()).hexdigest()
        yield int(hit['_id']), digest
def get_hashes(self):
    """
    Yield an (<article_id>, <hash>) tuple for every article in the article set
    given by self.options['articleset'].

    A field f from FIELDS is left out of the hash when the option "skip_<f>"
    is truthy. If no field is skipped, the "hash" field stored on each
    document is yielded unchanged; otherwise a hash is computed from the JSON
    serialisation of the remaining fields.

    @return A generator of (<article_id>, <hash>) tuples.
    """
    fields = [f for f in FIELDS if not self.options.get("skip_{}".format(f))]
    if fields == FIELDS:
        fields = ["hash"]
    # Decided once instead of on every hit; the field list does not change below.
    use_stored_hash = fields == ["hash"]

    setid = self.options['articleset'].id
    query = {"query": {"constant_score": {"filter": {"term": {"sets": setid}}}}}

    for hit in amcates.ES().scan(query=query, fields=fields):
        returned = hit['fields']
        if use_stored_hash:
            digest = returned['hash'][0]
        else:
            # Each returned field value is indexed with [0] (presumably the
            # search backend wraps field values in lists); absent fields stay None.
            values = {}
            for f in fields:
                val = returned.get(f)
                values[f] = val if val is None else val[0]
            # Hash functions operate on bytes, so the JSON text is encoded
            # first. json.dumps emits ASCII by default, so the bytes are the
            # same as the text on interpreters that accepted it directly.
            digest = hash_class(json.dumps(values).encode("utf-8")).hexdigest()
        yield int(hit['_id']), digest
def _get_hash(article):
    """
    Return the hex digest of an article's hash fields.

    The (field name, value) pairs for every field in HASH_FIELDS, in that
    order, are serialised with json.dumps and the result is hashed.

    @param article A mapping that contains every key in HASH_FIELDS
    @return The hexadecimal digest produced by hash_class
    """
    pairs = [(fn, article[fn]) for fn in HASH_FIELDS]
    # Hash functions operate on bytes, so the JSON text is encoded first.
    # json.dumps emits ASCII by default, so the bytes are the same as the
    # text on interpreters that accepted it directly.
    return hash_class(json.dumps(pairs).encode("utf-8")).hexdigest()