Example #1
    def _do_create_articles(cls, articles, deduplicate=True):
        """Check duplicates and save the articles to db.
        Does *not* save to elastic or add to articlesets
        Assumes that if .parent is given, it has an id
        (because parent is part of hash)
        modifies all articles in place with .hash, .id, .uuid, and .duplicate (None or Article)
        """
        es = amcates.ES()

        uuids = {} # {uuid : article}
        hashes = collections.defaultdict(list) # {hash: articles}
        
        # Iterate over articles, mark duplicates within addendum and build uuids/hashes dictionaries
        for a in articles:
            if a.id:
                raise ValueError("Specifying explicit article ID in save not allowed")
            a.es_dict = amcates.get_article_dict(a)
            a.hash = a.es_dict['hash']
            if not hasattr(a, 'uuid'): a.uuid = None
            a.duplicate, a.internal_duplicate = None, None # innocent until proven guilty
            if not deduplicate:
                continue
            if a.uuid:
                uuid = str(a.uuid)
                if uuid in uuids:
                    raise ValueError("Duplicate UUID in article upload")
                uuids[uuid] = a
            else:  # no uuid given: deduplicate on hash (articles with an explicit uuid are matched on uuid only)
                hashes[a.hash].append(a)
            
        def _set_dupe(dupe, orig):
            dupe.duplicate = orig
            dupe.id = orig.id
            dupe.uuid = orig.uuid
        # check dupes based on hash
        if hashes:
            results = es.query_all(filters={'hash': hashes.keys()},
                                   fields=["hash", "uuid"], score=False)
            for orig in results:
                for dupe in hashes[orig.hash]:
                    _set_dupe(dupe, orig)

        # check dupes based on uuid (if any are given)
        if uuids:
            results = es.query_all(filters={'uuid': uuids.keys()},
                                   fields=["hash", "uuid"], score=False)
            for orig in results:
                dupe = uuids[orig.uuid]
                if dupe.hash != orig.hash:
                    raise ValueError("Cannot modify existing articles: {orig.hash} != {dupe.hash}".format(**locals()))
                _set_dupe(dupe, orig)

        # now we can save the articles and set id
        to_insert = [a for a in articles if not a.duplicate]
        if not to_insert:
            return []
        result = bulk_insert_returning_ids(to_insert)
        for a, inserted in zip(to_insert, result):
            a.id = inserted.id
        return to_insert
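
A minimal caller sketch for the helper above, assuming it is exposed on the Article model and that bulk_insert_returning_ids has filled in .id on the new rows; the save_batch name and the new/duplicate split are illustrative, not part of the original code.

    # Hypothetical caller sketch (assumed names): save a batch, then separate
    # freshly inserted articles from duplicates matched in elastic.
    def save_batch(articles):
        new_articles = Article._do_create_articles(articles, deduplicate=True)
        duplicates = [a for a in articles if a.duplicate is not None]
        # after the call every article has an id: newly assigned, or copied from
        # the existing article via _set_dupe
        assert all(a.id is not None for a in articles)
        return new_articles, duplicates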
Example #2
    def test_elastic_hash(self):
        """Can we reproduce a hash from elastic data alone?"""
        article = Article(**{
            "date": datetime.date(2015, 1, 1),
            "section": "\u6f22\u5b57",
            "pagenr": 1928390,
            "headline": "Headline hier.",
            "byline": "byline..",
            "length": 1928,
            "metastring": "Even more strange characters.. \x0C ..",
            "url": "https://example.com",
            "externalid": None,
            "author": None,
            "addressee": "Hmm",
            "text": "Contains invalid char \x08 woo",
            "medium": create_test_medium(name="abc."),
            "project": create_test_project()
        })

        article.save()

        es = ES()
        es.add_articles([article.id])
        hash = get_article_dict(article)["hash"]
        es.flush()

        es_articles = es.query_all(filters={"ids": [article.id]}, fields=HASH_FIELDS + ["hash"])
        es_article = list(es_articles)[0]

        self.assertEqual(article.id, es_article.id)
        self.assertEqual(hash, es_article.hash)
        self.assertEqual(_get_hash(es_article.to_dict()), hash)
Example #3
    def create_articles(cls, articles, articleset=None, check_duplicate=True):
        """
        Add the given articles to the database, the index, and the given set

        Article objects can contain a 'custom' nested_articles attribute. In that case,
        this should be a list of article-like objects that will also be saved, and will
        have the .parent set to the containing article
        
        @param articles: a collection of objects with the necessary properties (.headline etc)
        @param articleset: an articleset object
        @param check_duplicate: if True, duplicates are not added to the database or index
            (the 'existing' article *is* added to the set)
        """
        # TODO: test parent logic (esp. together with hash/dupes)
        es = amcates.ES()

        # add dict (+hash) as property on articles so we know who is who
        sets = [articleset.id] if articleset else None
        for a in articles:
            a.es_dict = amcates.get_article_dict(a, sets=sets)

        if check_duplicate:
            hashes = [a.es_dict['hash'] for a in articles]
            results = es.query(filters={'hashes': hashes}, fields=["hash", "sets"], score=False)
            dupes = {r.hash: r for r in results}
        else:
            dupes = {}

        # add all non-dupes to the db, needed actions        
        add_to_set = set() # duplicate article ids to add to set
        add_new_to_set = set() # new article ids to add to set
        add_to_index = [] # es_dicts to add to index
        for a in articles:
            dupe = dupes.get(a.es_dict['hash'], None)
            if dupe:
                a.duplicate_of = dupe.id
                if articleset and not (dupe.sets and articleset.id in dupe.sets):
                    add_to_set.add(dupe.id)
            else:
                if a.parent:
                    a.parent_id = a.parent.duplicate_of if hasattr(a.parent, 'duplicate_of') else a.parent.id
                a.save()
                a.es_dict['id'] = a.pk
                add_to_index.append(a.es_dict)
                add_new_to_set.add(a.pk)

        log.info("Considered {} articles: {} saved to db, {} new to add to index, {} duplicates to add to set"
                 .format(len(articles), len(add_new_to_set), len(add_to_index), len(add_to_set)))

        # add to index
        if add_to_index:
            es.bulk_insert(add_to_index)
                
        if articleset:
            # add to articleset (db and index)
            articleset.add_articles(add_to_set | add_new_to_set, add_to_index=False)
            es.add_to_set(articleset.id, add_to_set)
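
A hedged usage sketch for this version of create_articles; amcattest.create_test_set() is borrowed from the test examples and scraped_articles stands for an assumed iterable of unsaved Article-like objects.

    # Hypothetical usage sketch: upload a batch into a set; hash-duplicates are not
    # re-saved but are still added to the articleset and flagged via .duplicate_of.
    articleset = amcattest.create_test_set()
    Article.create_articles(scraped_articles, articleset=articleset, check_duplicate=True)
    for a in scraped_articles:
        if hasattr(a, "duplicate_of"):
            log.info("Duplicate of article {}".format(a.duplicate_of))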
Example #4
    def test_elastic_hash(self):
        """Can we reproduce a hash from elastic data alone?"""
        article = Article(**{
            "date": datetime.date(2015, 1, 1),
            "title": "\u6f22\u5b57",
            "text": "Even more strange characters.. \x0C and \x08 woo?",
            "url": "https://example.com",
            "project": create_test_project()
        })

        hash = get_article_dict(article)['hash']
        Article.create_articles([article], articleset=amcattest.create_test_set())
        ES().refresh()
        es_articles = ES().query_all(filters={"ids": [article.id]}, _source=["hash"])
        es_article = list(es_articles)[0]

        self.assertEqual(article.id, es_article.id)
        self.assertEqual(hash, es_article.hash)
        self.assertEqual(hash, article.hash)
Example #5
    def test_elastic_hash(self):
        """Can we reproduce a hash from elastic data alone?"""
        article = Article(
            **{
                "date": datetime.date(2015, 1, 1),
                "title": "\u6f22\u5b57",
                "text": "Even more strange characters.. \x0C and \x08 woo?",
                "url": "https://example.com",
                "project": create_test_project()
            })

        hash = get_article_dict(article)['hash']
        Article.create_articles([article],
                                articleset=amcattest.create_test_set())
        ES().refresh()
        es_articles = ES().query_all(filters={"ids": [article.id]},
                                     fields=["hash"])
        es_article = list(es_articles)[0]

        self.assertEqual(article.id, es_article.id)
        self.assertEqual(hash, es_article.hash)
        self.assertEqual(hash, article.hash)
Example #6
    def get_article_dict(self, **kargs):
        return amcates.get_article_dict(self, **kargs)
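
The method above is a thin convenience wrapper; a short sketch of the equivalence it provides, assuming an existing article instance:

    # Both calls should return the same dict, including the 'hash' key used for
    # deduplication elsewhere.
    d1 = article.get_article_dict()
    d2 = amcates.get_article_dict(article)
    assert d1['hash'] == d2['hash']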
Example #7
    def get_articles(self, fn, media):
        csv.field_size_limit(sys.maxsize)
        def _int(x):
            return int(x) if x else None
        def hash2binary(hash):
            if hash:
                if not isinstance(hash, str):
                    raise TypeError("Hash should be str, not {}".format(type(hash)))
                return "\\x" + hash


        r = csv.reader(open(fn))
        header = next(r)
        index = {col: i for (i, col) in enumerate(header)}
        AID = index['article_id']
        if self.maxid:
            logging.info("*** max(id) set by user: {self.maxid}".format(**locals()))
            max_id, self.n_rows = self.maxid, self.maxid
        else:
            logging.info("*** Scan input CSV to determine #rows and max(id)")
            for row in r:
                max_id = max(max_id, int(row[AID]))
                self.n_rows += 1
                if not self.n_rows % 10000000:
                    logging.info(".. scanned {self.n_rows} rows".format(**locals()))
            self.maxid = max_id
            
        logging.info("{self.n_rows} rows, max ID {max_id}, allocating memory for hashes".format(**locals()))

        hashes = ctypes.create_string_buffer(max_id*28)
        NULL_HASH = b'\x00' * 28
        orphans = "PLENTY"
        passno = 1

        if self._continue:
            logging.info("Continuing from previous migration, getting state from DB")
            with conn().cursor('migration-continue') as c:
                c.itersize = 10000 # how many records to buffer on the client
                c.execute("SELECT article_id, hash FROM articles")
                i = 0
                while True:
                    rows = c.fetchmany(10000)
                    if not rows:
                        break
                    i += len(rows)
                    if not i % 1000000:
                        logging.info("Retrieved {i} rows...".format(**locals()))
                    for (aid, hash) in rows:
                        offset = (aid - 1) * 28
                        hashes[offset:offset+28] = hash
            self.n_rows -= i
            logging.info("Continuing migration, {i} articles retrieved, up to {self.n_rows} to go".format(**locals()))
        
        while orphans:
            norphans = len(orphans) if isinstance(orphans, list) else orphans
            logging.info("*** Pass {passno}, #orphans {norphans}".format(**locals()))
            passno += 1

            if orphans == "PLENTY":
                r = csv.reader(open(fn))
                next(r) # skip header
                todo = r
            else:
                todo = orphans
            
            orphans = []
            MAX_ORPHANS_BUFFER = 50000
            
            for i, row in enumerate(todo):
                if not i % 1000000:
                    norphans = len(orphans) if isinstance(orphans, list) else orphans
                    logging.info("Row {i}, #orphans: {norphans}".format(**locals()))

                aid = int(row[AID])
                
                offset = (aid - 1) * 28
                stored_hash = hashes[offset:offset+28]
                if stored_hash != NULL_HASH:
                    continue
                
                parent_id = _int(row[index['parent_article_id']])
                if (parent_id == aid) or (parent_id in SKIP_PARENTS):
                    parent_id = None
                if parent_id:
                    poffset = (parent_id - 1) * 28
                    parent_hash = hashes[poffset:poffset+28]
                    if parent_hash == NULL_HASH:
                        # it's an orphan, can't process it now, so either buffer or re-iterate
                        if orphans != "PLENTY": # try to buffer
                            if len(orphans) > MAX_ORPHANS_BUFFER:
                                orphans = "PLENTY"
                            else:
                                orphans.append(row)
                        continue
                    parent_hash = binascii.hexlify(parent_hash).decode("ascii")
                else:
                    parent_hash = None

                date = row[index['date']]
                date = date.split("+")[0]
                date = datetime.strptime(date[:19], '%Y-%m-%d %H:%M:%S')

                
                a = Article(
                    project_id = row[index['project_id']],
                    date = date,
                    title = row[index['headline']],
                    url = row[index['url']] or None,
                    text = row[index['text']],
                    parent_hash=parent_hash)
                
                a.properties = {v: row[index[v]] for v in PROP_FIELDS if row[index[v]]}
                a.properties['medium'] = media[int(row[index['medium_id']])]
                a.properties['uuid'] = str(a.properties['uuid'])
                props = json.dumps(a.properties)
            
                hash = amcates.get_article_dict(a)['hash']
                hashes[offset:offset+28] = binascii.unhexlify(hash)

                yield (a.project_id, aid, a.date, a.title, a.url, a.text,
                       hash2binary(hash), hash2binary(a.parent_hash), props)
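
The fixed 28-byte slots in the hash buffer imply a 28-byte digest, which matches a SHA-224 hex digest once unhexlified. The sketch below only illustrates that size arithmetic and the hex-bytea form produced by hash2binary; the actual hashing inside amcates.get_article_dict is not shown here, so SHA-224 is an assumption.

    # Size check for the 28-byte slots (assumption: hashes are SHA-224 hex digests).
    import binascii, hashlib

    hex_hash = hashlib.sha224(b"example article dict").hexdigest()  # 56 hex characters
    raw = binascii.unhexlify(hex_hash)                              # 28 bytes: one buffer slot
    assert len(raw) == 28
    # hash2binary() above prefixes the hex string with "\x", i.e. PostgreSQL's
    # hex bytea escape for the same 28 raw bytes.
    assert "\\x" + hex_hash == "\\x" + binascii.hexlify(raw).decode("ascii")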
Example #8
    def create_articles(cls,
                        articles,
                        articleset=None,
                        check_duplicate=True,
                        create_id=False):
        """
        Add the given articles to the database, the index, and the given set

        Article objects can contain a 'custom' nested_articles attribute. In that case,
        this should be a list of article-like objects that will also be saved, and will
        have the .parent set to the containing article

        @param articles: a collection of objects with the necessary properties (.headline etc)
        @param articleset: an articleset object
        @param check_duplicate: if True, duplicates are not added to the database or index
                                (the 'existing' article *is* added to the set)
        @param create_id: if True, also create articles that have an .id (for testing)
        """
        # TODO: test parent logic (esp. together with hash/dupes)
        es = amcates.ES()
        for a in articles:
            if a.length is None:
                a.length = word_len(a.text) + word_len(a.headline) + word_len(
                    a.byline)
        # existing / duplicate article ids to add to set
        add_to_set = set()
        # add dict (+hash) as property on articles so we know who is who
        sets = [articleset.id] if articleset else None
        todo = []
        for a in articles:
            if a.id and not create_id:
                # article already exists, only add to set
                add_to_set.add(a.id)
            else:
                a.es_dict = amcates.get_article_dict(a, sets=sets)
                todo.append(a)

        if check_duplicate:
            hashes = [a.es_dict['hash'] for a in todo]
            results = es.query_all(filters={'hashes': hashes},
                                   fields=["hash", "sets"],
                                   score=False)
            dupes = {r.hash: r for r in results}
        else:
            dupes = {}

        # add all non-dupes to the db, needed actions
        add_new_to_set = set()  # new article ids to add to set
        add_to_index = []  # es_dicts to add to index
        result = []  # return result
        errors = []  # return errors
        for a in todo:
            dupe = dupes.get(a.es_dict['hash'], None)
            a.duplicate = bool(dupe)
            if a.duplicate:
                a.id = dupe.id
                if articleset and not (dupe.sets
                                       and articleset.id in dupe.sets):
                    add_to_set.add(dupe.id)
            else:
                if a.parent:
                    a.parent_id = a.parent.id

                sid = transaction.savepoint()
                try:
                    a.save()
                    transaction.savepoint_commit(sid)
                except (IntegrityError, ValidationError, DatabaseError) as e:
                    log.warning(str(e))
                    transaction.savepoint_rollback(sid)
                    errors.append(e)
                    continue
                result.append(a)
                a.es_dict['id'] = a.pk
                add_to_index.append(a.es_dict)
                add_new_to_set.add(a.pk)

        log.info(
            "Considered {} articles: {} saved to db, {} new to add to index, {} existing/duplicates to add to set"
            .format(len(articles), len(add_new_to_set), len(add_to_index),
                    len(add_to_set)))

        # add to index
        if add_to_index:
            es.bulk_insert(add_to_index)
            log.info("Added {} to index".format(len(add_to_index)))

        if articleset:
            # add to articleset (db and index)
            articleset.add_articles(add_to_set | add_new_to_set,
                                    add_to_index=False)
            log.info("Added {} to db".format(len(add_to_set | add_new_to_set)))
            es.add_to_set(articleset.id, add_to_set)
            log.info("Added {} to elastic".format(len(add_to_set)))

        return result, errors
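
A hedged caller sketch for this variant, which returns both the saved articles and any database errors; amcattest.create_test_set() is borrowed from the tests above and the surrounding names are assumptions.

    # Hypothetical usage sketch: save, report errors, and count detected duplicates.
    saved, errors = Article.create_articles(articles, articleset=amcattest.create_test_set())
    for e in errors:
        log.warning("Article could not be saved: {}".format(e))
    n_duplicates = sum(1 for a in articles if getattr(a, "duplicate", False))
    log.info("{} saved, {} duplicates".format(len(saved), n_duplicates))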
Example #9
    def get_articles(self, fn, media):
        csv.field_size_limit(sys.maxsize)

        def _int(x):
            return int(x) if x else None

        def hash2binary(hash):
            if hash:
                if not isinstance(hash, str):
                    raise TypeError("Hash should be str, not {}".format(
                        type(hash)))
                return "\\x" + hash

        r = csv.reader(open(fn))
        header = next(r)
        index = {col: i for (i, col) in enumerate(header)}
        AID = index['article_id']
        if self.maxid:
            logging.info(
                "*** max(id) set by user: {self.maxid}".format(**locals()))
            max_id, self.n_rows = self.maxid, self.maxid
        else:
            logging.info("*** Scan input CSV to determine #rows and max(id)")
            for row in r:
                max_id = max(max_id, int(row[AID]))
                self.n_rows += 1
                if not self.n_rows % 10000000:
                    logging.info(
                        ".. scanned {self.n_rows} rows".format(**locals()))

        logging.info(
            "{self.n_rows} rows, max ID {max_id}, allocating memory for hashes"
            .format(**locals()))

        hashes = ctypes.create_string_buffer(max_id * 28)
        NULL_HASH = b'\x00' * 28
        orphans = "N/A"
        passno = 1

        if self._continue:
            logging.info(
                "Continuing from previous migration, getting state from DB")
            c = conn().cursor('migration-continue')
            c.itersize = 10000  # how many records to buffer on the client
            c.execute("SELECT article_id, hash FROM articles")
            i = 0
            while True:
                rows = c.fetchmany(10000)
                if not rows:
                    break
                i += len(rows)
                if not i % 1000000:
                    logging.info("Retrieved {i} rows...")
                for (aid, hash) in rows:
                    offset = (aid - 1) * 28
                    hashes[offset:offset + 28] = hash
            self.n_rows -= i
            logging.info(
                "Continuing migration, {i} articles retrieved, {self.n_rows} to go"
                .format(**locals()))

        while orphans:
            logging.info(
                "*** Pass {passno}, #orphans {orphans}".format(**locals()))
            passno += 1
            orphans = 0

            r = csv.reader(open(fn))
            next(r)  # skip header

            for row in r:
                aid = int(row[AID])

                offset = (aid - 1) * 28
                stored_hash = hashes[offset:offset + 28]
                if stored_hash != NULL_HASH:
                    continue

                parent_id = _int(row[index['parent_article_id']])
                if (parent_id == aid) or (parent_id in SKIP_PARENTS):
                    parent_id = None
                if parent_id:
                    poffset = (parent_id - 1) * 28
                    parent_hash = hashes[poffset:poffset + 28]
                    if parent_hash == NULL_HASH:
                        orphans += 1
                        continue
                    parent_hash = binascii.hexlify(parent_hash).decode("ascii")
                else:
                    parent_hash = None

                date = row[index['date']]
                date = date.split("+")[0]
                date = datetime.strptime(date, '%Y-%m-%d %H:%M:%S')

                a = Article(project_id=row[index['project_id']],
                            date=date,
                            title=row[index['headline']],
                            url=row[index['url']] or None,
                            text=row[index['text']],
                            parent_hash=parent_hash)

                a.properties = {
                    v: row[index[v]]
                    for v in PROP_FIELDS if row[index[v]]
                }
                a.properties['medium'] = media[int(row[index['medium_id']])]
                a.properties['uuid'] = str(a.properties['uuid'])
                props = json.dumps(a.properties)

                hash = amcates.get_article_dict(a)['hash']
                hashes[offset:offset + 28] = binascii.unhexlify(hash)

                yield (a.project_id, aid, a.date, a.title, a.url, a.text,
                       hash2binary(hash), hash2binary(a.parent_hash), props)
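
The loop above relies on a multi-pass resolution: a row whose parent has not been hashed yet counts as an orphan and is picked up again on the next pass over the file. A minimal standalone sketch of that idea, using generic (id, parent_id) pairs instead of CSV rows:

    def resolve_in_passes(rows):
        # rows: iterable of (id, parent_id) pairs; yields ids so that every parent
        # is yielded before its children, retrying orphans on later passes.
        done = set()
        pending = list(rows)
        while pending:
            orphans = []
            for rid, parent_id in pending:
                if parent_id is not None and parent_id not in done:
                    orphans.append((rid, parent_id))  # parent not processed yet
                    continue
                done.add(rid)
                yield rid
            if len(orphans) == len(pending):
                raise ValueError("no progress: missing parents or a cycle")
            pending = orphans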
Example #10
    def create_articles(cls, articles, articleset=None, check_duplicate=True, create_id=False):
        """
        Add the given articles to the database, the index, and the given set

        Article objects can contain a 'custom' nested_articles attribute. In that case,
        this should be a list of article-like objects that will also be saved, and will
        have the .parent set to the containing article

        @param articles: a collection of objects with the necessary properties (.headline etc)
        @param articleset: an articleset object
        @param check_duplicate: if True, duplicates are not added to the database or index
                                (the 'existing' article *is* added to the set)
        @param create_id: if True, also create articles that have an .id (for testing)
        """
        # TODO: test parent logic (esp. together with hash/dupes)
        es = amcates.ES()

        # existing / duplicate article ids to add to set
        add_to_set = set() 
        # add dict (+hash) as property on articles so we know who is who
        sets = [articleset.id] if articleset else None
        todo = []
        for a in articles:
            if a.id and not create_id:
                # article already exists, only add to set
                add_to_set.add(a.id)
            else:
                a.es_dict = amcates.get_article_dict(a, sets=sets)
                todo.append(a)

        if check_duplicate:
            hashes = [a.es_dict['hash'] for a in todo]
            results = es.query(filters={'hashes': hashes}, fields=["hash", "sets"], score=False)
            dupes = {r.hash: r for r in results}
        else:
            dupes = {}

        # add all non-dupes to the db, needed actions
        add_new_to_set = set() # new article ids to add to set
        add_to_index = [] # es_dicts to add to index
        result = [] # return result
        errors = [] # return errors
        for a in todo:
            dupe = dupes.get(a.es_dict['hash'], None)
            if dupe:
                a.duplicate_of = dupe.id
                if articleset and not (dupe.sets and articleset.id in dupe.sets):
                    add_to_set.add(dupe.id)
            else:
                if a.parent:
                    a.parent_id = a.parent.duplicate_of if hasattr(a.parent, 'duplicate_of') else a.parent.id
                sid = transaction.savepoint()
                try:
                    a.save()
                    transaction.savepoint_commit(sid)
                except (IntegrityError, ValidationError, DatabaseError) as e:
                    log.warning(str(e))
                    transaction.savepoint_rollback(sid)
                    errors.append(e)
                    continue
                result.append(a)
                a.es_dict['id'] = a.pk
                add_to_index.append(a.es_dict)
                add_new_to_set.add(a.pk)

        log.info("Considered {} articles: {} saved to db, {} new to add to index, {} existing/duplicates to add to set"
                 .format(len(articles), len(add_new_to_set), len(add_to_index), len(add_to_set)))

        # add to index
        if add_to_index:
            es.bulk_insert(add_to_index)

        if articleset:
            # add to articleset (db and index)
            articleset.add_articles(add_to_set | add_new_to_set, add_to_index=False)
            es.add_to_set(articleset.id, add_to_set)

        return result, errors
Example #11
    def create_articles(cls, articles, articleset=None, check_duplicate=True):
        """
        Add the given articles to the database, the index, and the given set

        Article objects can contain a 'custom' nested_articles attribute. In that case,
        this should be a list of article-like objects that will also be saved, and will
        have the .parent set to the containing article
        
        @param articles: a collection of objects with the necessary properties (.headline etc)
        @param articleset: an articleset object
        @param check_duplicate: if True, duplicates are not added to the database or index
            (the 'existing' article *is* added to the set)
        """
        # TODO: test parent logic (esp. together with hash/dupes)
        es = amcates.ES()

        # add dict (+hash) as property on articles so we know who is who
        sets = [articleset.id] if articleset else None
        for a in articles:
            a.es_dict = amcates.get_article_dict(a, sets=sets)

        if check_duplicate:
            hashes = [a.es_dict['hash'] for a in articles]
            results = es.query(filters={'hashes': hashes},
                               fields=["hash", "sets"],
                               score=False)
            dupes = {r.hash: r for r in results}
        else:
            dupes = {}

        # add all non-dupes to the db, needed actions
        add_to_set = set()  # duplicate article ids to add to set
        add_new_to_set = set()  # new article ids to add to set
        add_to_index = []  # es_dicts to add to index
        for a in articles:
            dupe = dupes.get(a.es_dict['hash'], None)
            if dupe:
                a.duplicate_of = dupe.id
                if articleset and not (dupe.sets
                                       and articleset.id in dupe.sets):
                    add_to_set.add(dupe.id)
            else:
                if a.parent:
                    a.parent_id = a.parent.duplicate_of if hasattr(
                        a.parent, 'duplicate_of') else a.parent.id
                a.save()
                a.es_dict['id'] = a.pk
                add_to_index.append(a.es_dict)
                add_new_to_set.add(a.pk)

        log.info(
            "Considered {} articles: {} saved to db, {} new to add to index, {} duplicates to add to set"
            .format(len(articles), len(add_new_to_set), len(add_to_index),
                    len(add_to_set)))

        # add to index
        if add_to_index:
            es.bulk_insert(add_to_index)

        if articleset:
            # add to articleset (db and index)
            articleset.add_articles(add_to_set | add_new_to_set,
                                    add_to_index=False)
            es.add_to_set(articleset.id, add_to_set)