Esempio n. 1
0
def _hash_dict(d):
    """Return the hex digest of a dict, visiting its keys in sorted order.

    Every key and its value are passed through ``_encode_field`` and then
    ``_escape_bytes`` before being fed to the hasher; a ``b","`` is fed
    after each key/value pair.
    """
    hasher = hash_class()
    for name in sorted(d):
        for part in (name, d[name]):
            hasher.update(_escape_bytes(_encode_field(part)))
        hasher.update(b",")
    return hasher.hexdigest()
Esempio n. 2
0
    def hash_articles(cls, articleset: ArticleSet, ignore_fields: set) -> Iterable[Tuple[int, str]]:
        """
        Yields an (article id, hash) pair for every article found by scanning the given articleset.

        If ignore_fields is empty, the "hash" value stored with each article is yielded unchanged.
        Otherwise a hash is computed over the tuple of stringified values of all remaining fields,
        ordered alphabetically by field name, so fields in ignore_fields do not affect the hash.

        @param articleset       The articleset that is to be searched
        @param ignore_fields    A set of fields that should not be included in the calculated hashes

        @return                 An iterable of (<article_id>, <hash>) tuples.
        """
        all_fields = STATIC_FIELDS + list(articleset.get_used_properties())

        if not ignore_fields:
            fields = ["hash"]
        else:
            fields = sorted(f for f in all_fields if f not in ignore_fields)

        # Build the query once and scan once; the result of an extra scan used to be
        # assigned to the loop variable and thrown away.
        query = {"query": {"constant_score": {"filter": {"term": {"sets": articleset.id}}}}}
        for hit in amcates.ES().scan(query=query, _source=fields):
            article_id = int(hit['_id'])
            source = hit['_source']
            if not ignore_fields:
                yield article_id, source['hash']
                continue
            # A field missing from the hit contributes str([None]), i.e. the text "[None]".
            art_tuple = tuple(str(source.get(k, [None])) for k in fields)
            yield article_id, hash_class(repr(art_tuple).encode()).hexdigest()
Esempio n. 3
0
    def get_hashes(self):
        """Yield an (article id, hash) pair for every hit of the articleset scan.

        Fields whose "skip_<field>" option is set are left out. If no field is
        skipped, the "hash" value stored with each article is used directly;
        otherwise a hash is computed over the JSON dump of the remaining
        fields (first value of each, or None if the field is absent).
        """
        fields = [
            f for f in FIELDS if not self.options.get("skip_{}".format(f))
        ]
        if fields == FIELDS:
            fields = ["hash"]
        # `fields` is not modified below, so this can be decided once.
        use_stored_hash = fields == ["hash"]
        setid = self.options['articleset'].id

        def first_value(hit_fields, name):
            # Hit fields hold lists of values; only the first one is used.
            values = hit_fields.get(name)
            return values[0] if values is not None else values

        query = {"query": {"constant_score": {"filter": {"term": {"sets": setid}}}}}
        for hit in amcates.ES().scan(query=query, fields=fields):
            if use_stored_hash:
                digest = hit['fields']['hash'][0]
            else:
                d = {f: first_value(hit['fields'], f) for f in fields}
                # Hashers need bytes; json.dumps emits ASCII-only text by
                # default, so encoding does not change the hashed bytes.
                digest = hash_class(json.dumps(d).encode("utf-8")).hexdigest()
            yield int(hit['_id']), digest
Esempio n. 4
0
def _hash_dict(d):
    """Hash the key/value pairs of ``d`` in sorted key order.

    For each key, the escaped encoding of the key, the escaped encoding of
    its value and a ``b","`` are fed to the hasher, in that order. The
    hexadecimal digest is returned.
    """
    digest = hash_class()
    for field_name in sorted(d):
        encoded_name = _escape_bytes(_encode_field(field_name))
        encoded_value = _escape_bytes(_encode_field(d[field_name]))
        digest.update(encoded_name)
        digest.update(encoded_value)
        digest.update(b",")
    return digest.hexdigest()
Esempio n. 5
0
def _get_legacy_hash(article_dict):
    """Return the hex digest over the LEGACY_HASH_FIELDS values of an article.

    Values are fed to the hasher in LEGACY_HASH_FIELDS order: ints as
    ``str(v)``, unicode values encoded as UTF-8, ``None`` skipped, and
    anything else passed through as-is.
    """
    hasher = hash_class()
    for field in LEGACY_HASH_FIELDS:
        value = article_dict[field]
        if value is None:
            continue
        if isinstance(value, int):
            chunk = str(value)
        elif isinstance(value, unicode):
            chunk = value.encode('utf-8')
        else:
            chunk = value
        hasher.update(chunk)
    return hasher.hexdigest()
Esempio n. 6
0
def _get_hash(article_dict):
    """Return the hex digest over an article's values, in sorted key order.

    The keys 'id', 'sets', 'hash', 'medium' and 'projectid' are left out.
    Ints are fed as ``str(v)``, unicode values encoded as UTF-8, ``None``
    is skipped, and anything else is passed through as-is.
    """
    excluded = ('id', 'sets', 'hash', 'medium', 'projectid')
    hashed_keys = [key for key in article_dict.keys() if key not in excluded]
    hashed_keys.sort()

    hasher = hash_class()
    for key in hashed_keys:
        value = article_dict[key]
        if value is None:
            continue
        if isinstance(value, int):
            chunk = str(value)
        elif isinstance(value, unicode):
            chunk = value.encode('utf-8')
        else:
            chunk = value
        hasher.update(chunk)
    return hasher.hexdigest()
Esempio n. 7
0
def _get_hash(article_dict):
    """Compute the hex digest of an article dict from its field values.

    Keys are visited in sorted order, leaving out 'id', 'sets', 'hash',
    'medium' and 'projectid'. A value of ``None`` contributes nothing; an
    int contributes ``str(v)``; a unicode value contributes its UTF-8
    encoding; any other value is handed to the hasher unchanged.
    """
    skipped_keys = ('id', 'sets', 'hash', 'medium', 'projectid')
    digest = hash_class()
    for name in sorted(k for k in article_dict.keys() if k not in skipped_keys):
        value = article_dict[name]
        if value is None:
            continue
        if isinstance(value, int):
            value = str(value)
        elif isinstance(value, unicode):
            value = value.encode('utf-8')
        digest.update(value)
    return digest.hexdigest()
Esempio n. 8
0
    def hash_articles(cls, articleset: ArticleSet,
                      ignore_fields: set) -> Iterable[Tuple[int, str]]:
        """
        Yields an (article id, hash) pair for every article found by scanning the given articleset.

        If ignore_fields is empty, the first "hash" value stored with each article is yielded unchanged.
        Otherwise a hash is computed over the tuple of stringified first values of all remaining fields,
        ordered alphabetically by field name, so fields in ignore_fields do not affect the hash.

        @param articleset       The articleset that is to be searched
        @param ignore_fields    A set of fields that should not be included in the calculated hashes

        @return                 An iterable of (<article_id>, <hash>) tuples.
        """
        all_fields = STATIC_FIELDS + list(articleset.get_used_properties())

        if not ignore_fields:
            fields = ["hash"]
        else:
            fields = sorted(f for f in all_fields if f not in ignore_fields)

        # Build the query once and scan once; the result of an extra scan used to be
        # assigned to the loop variable and thrown away.
        query = {
            "query": {
                "constant_score": {
                    "filter": {
                        "term": {
                            "sets": articleset.id
                        }
                    }
                }
            }
        }
        for hit in amcates.ES().scan(query=query, fields=fields):
            article_id = int(hit['_id'])
            hit_fields = hit['fields']
            if not ignore_fields:
                yield article_id, hit_fields['hash'][0]
                continue
            # Hit fields hold lists of values; a missing field counts as None.
            art_tuple = tuple(
                str(hit_fields.get(k, [None])[0]) for k in fields)
            yield article_id, hash_class(repr(art_tuple).encode()).hexdigest()
Esempio n. 9
0
 def get_hashes(self):
     """Yield an (article id, hash) pair for every hit of the articleset scan.

     Fields whose "skip_<field>" option is set are left out. If no field is
     skipped, the "hash" value stored with each article is used directly;
     otherwise a hash is computed over the JSON dump of the remaining
     fields (first value of each, or None if the field is absent).
     """
     fields = [f for f in FIELDS if not self.options.get("skip_{}".format(f))]
     if fields == FIELDS:
         fields = ["hash"]
     # `fields` is not modified below, so this can be decided once.
     use_stored_hash = fields == ["hash"]
     setid = self.options['articleset'].id

     def first_value(hit_fields, name):
         # Hit fields hold lists of values; only the first one is used.
         values = hit_fields.get(name)
         return values[0] if values is not None else values

     query = {"query" : {"constant_score" : {"filter": {"term": {"sets": setid}}}}}
     for hit in amcates.ES().scan(query=query, fields=fields):
         if use_stored_hash:
             digest = hit['fields']['hash'][0]
         else:
             d = {f: first_value(hit['fields'], f) for f in fields}
             # Hashers need bytes; json.dumps emits ASCII-only text by
             # default, so encoding does not change the hashed bytes.
             digest = hash_class(json.dumps(d).encode("utf-8")).hexdigest()
         yield int(hit['_id']), digest
Esempio n. 10
0
def _get_hash(article):
    """Return the hex digest of an article's HASH_FIELDS values.

    The (field name, value) pairs are collected in HASH_FIELDS order,
    dumped as JSON, and the encoded JSON text is hashed.
    """
    field_pairs = [(fn, article[fn]) for fn in HASH_FIELDS]
    # Hashers need bytes; json.dumps emits ASCII-only text by default, so
    # encoding does not change the hashed bytes.
    serialized = json.dumps(field_pairs).encode("utf-8")
    return hash_class(serialized).hexdigest()