Example #1
    def get_articles(self, fn, media):
        csv.field_size_limit(sys.maxsize)
        def _int(x):
            return int(x) if x else None
        def hash2binary(hash):
            # render a hex digest as a PostgreSQL bytea hex literal (\x...)
            if hash:
                if not isinstance(hash, str):
                    raise TypeError("Hash should be str, not {}".format(type(hash)))
                return "\\x" + hash


        r = csv.reader(open(fn))
        header = next(r)
        index = {col: i for (i, col) in enumerate(header)}
        AID = index['article_id']
        if self.maxid:
            logging.info("*** max(id) set by user: {self.maxid}".format(**locals()))
            max_id, self.n_rows = self.maxid, self.maxid
        else:
            logging.info("*** Scan input CSV to determine #rows and max(id)")
            for row in r:
                max_id = max(max_id, int(row[AID]))
                self.n_rows += 1
                if not self.n_rows % 10000000:
                    logging.info(".. scanned {self.n_rows} rows".format(**locals()))
            self.maxid = max_id
            
        logging.info("{self.n_rows} rows, max ID {max_id}, allocating memory for hashes".format(**locals()))

        hashes = ctypes.create_string_buffer(max_id * 28)
        NULL_HASH = b'\x00' * 28
        orphans = "PLENTY"
        passno = 1

        if self._continue:
            logging.info("Continuing from previous migration, getting state from DB")
            with conn().cursor('migration-continue') as c:
                c.itersize = 10000  # how many records to buffer on the client
                c.execute("SELECT article_id, hash FROM articles")
                i = 0
                while True:
                    rows = c.fetchmany(10000)
                    if not rows:
                        break
                    i += len(rows)
                    if not i % 1000000:
                        logging.info("Retrieved {i} rows...".format(**locals()))
                    for (aid, hash) in rows:
                        offset = (aid - 1) * 28
                        hashes[offset:offset+28] = hash
            self.n_rows -= i
            logging.info("Continuing migration, {i} articles retrieved, up to {self.n_rows} to go".format(**locals()))
        
        while orphans:
            norphans = len(orphans) if isinstance(orphans, list) else orphans
            logging.info("*** Pass {passno}, #orphans {norphans}".format(**locals()))
            passno += 1

            if orphans == "PLENTY":
                r = csv.reader(open(fn))
                next(r) # skip header
                todo = r
            else:
                todo = orphans
            
            orphans = []
            MAX_ORPHANS_BUFFER = 50000
            
            for i, row in enumerate(todo):
                if not i % 1000000:
                    norphans = len(orphans) if isinstance(orphans, list) else orphans
                    logging.info("Row {i}, #orphans: {norphans}".format(**locals()))

                aid = int(row[AID])
                
                offset = (aid - 1) * 28
                stored_hash = hashes[offset:offset+28]
                if stored_hash != NULL_HASH:
                    continue
                
                parent_id = _int(row[index['parent_article_id']])
                if (parent_id == aid) or (parent_id in SKIP_PARENTS):
                    parent_id = None
                if parent_id:
                    poffset = (parent_id - 1) * 28
                    parent_hash = hashes[poffset:poffset+28]
                    if parent_hash == NULL_HASH:
                        # it's an orphan, can't process it now, so either buffer or re-iterate
                        if orphans != "PLENTY": # try to buffer
                            if len(orphans) > MAX_ORPHANS_BUFFER:
                                orphans = "PLENTY"
                            else:
                                orphans.append(row)
                        continue
                    parent_hash = binascii.hexlify(parent_hash).decode("ascii")
                else:
                    parent_hash = None

                date = row[index['date']]
                date = date.split("+")[0]
                date = datetime.strptime(date[:19], '%Y-%m-%d %H:%M:%S')

                
                a = Article(
                    project_id = row[index['project_id']],
                    date = date,
                    title = row[index['headline']],
                    url = row[index['url']] or None,
                    text = row[index['text']],
                    parent_hash=parent_hash)
                
                a.properties = {v: row[index[v]] for v in PROP_FIELDS if row[index[v]]}
                a.properties['medium'] = media[int(row[index['medium_id']])]
                a.properties['uuid'] = str(a.properties['uuid'])
                props = json.dumps(a.properties)
            
                hash = amcates.get_article_dict(a)['hash']
                hashes[offset:offset+28] = binascii.unhexlify(hash)

                yield (a.project_id, aid, a.date, a.title, a.url, a.text,
                       hash2binary(hash), hash2binary(a.parent_hash), props)
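
The generator above streams one tuple per article in a form suited to a PostgreSQL bulk load: hash2binary prefixes each hex digest with \x so the server can parse it as a bytea hex literal. Below is a minimal sketch of one way such a stream could be consumed, assuming a psycopg2 connection from the same conn() factory; the copy_articles helper, the articles_tmp table, and its column list are hypothetical and only serve as an illustration.

# Hedged sketch: buffer the generator's output as tab-separated CSV and load
# it with COPY. copy_articles, articles_tmp and its columns are assumptions,
# not taken from the project above.
import csv
import io

def copy_articles(migrator, fn, media):
    buf = io.StringIO()
    writer = csv.writer(buf, delimiter='\t')
    for row in migrator.get_articles(fn, media):
        # empty fields are mapped back to NULL by the COPY options below
        writer.writerow('' if v is None else v for v in row)
    buf.seek(0)
    cn = conn()
    with cn.cursor() as c:
        c.copy_expert(
            "COPY articles_tmp (project_id, article_id, date, title, url,"
            " text, hash, parent_hash, properties) FROM STDIN"
            " WITH (FORMAT csv, DELIMITER E'\\t', NULL '')",
            buf)
    cn.commit()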
Example #2
    def get_articles(self, fn, media):
        csv.field_size_limit(sys.maxsize)

        def _int(x):
            return int(x) if x else None

        def hash2binary(hash):
            if hash:
                if not isinstance(hash, str):
                    raise TypeError("Hash should be str, not {}".format(
                        type(hash)))
                return "\\x" + hash

        r = csv.reader(open(fn))
        header = next(r)
        index = {col: i for (i, col) in enumerate(header)}
        AID = index['article_id']
        if self.maxid:
            logging.info(
                "*** max(id) set by user: {self.maxid}".format(**locals()))
            max_id, self.n_rows = self.maxid, self.maxid
        else:
            logging.info("*** Scan input CSV to determine #rows and max(id)")
            for row in r:
                max_id = max(max_id, int(row[AID]))
                self.n_rows += 1
                if not self.n_rows % 10000000:
                    logging.info(
                        ".. scanned {self.n_rows} rows".format(**locals()))

        logging.info(
            "{self.n_rows} rows, max ID {max_id}, allocating memory for hashes"
            .format(**locals()))

        hashes = ctypes.create_string_buffer(max_id * 28)
        NULL_HASH = b'\x00' * 28
        orphans = "N/A"
        passno = 1

        if self._continue:
            logging.info(
                "Continuing from previous migration, getting state from DB")
            c = conn().cursor('migration-continue')
            c.itersize = 10000  # how many records to buffer on the client
            c.execute("SELECT article_id, hash FROM articles")
            i = 0
            while True:
                rows = c.fetchmany(10000)
                if not rows:
                    break
                i += len(rows)
                if not i % 1000000:
                    logging.info("Retrieved {i} rows...")
                for (aid, hash) in rows:
                    offset = (aid - 1) * 28
                    hashes[offset:offset + 28] = hash
            self.n_rows -= i
            logging.info(
                "Continuing migration, {i} articles retrieved, {self.n_rows} to go"
                .format(**locals()))

        while orphans:
            logging.info(
                "*** Pass {passno}, #orphans {orphans}".format(**locals()))
            passno += 1
            orphans = 0

            r = csv.reader(open(fn))
            next(r)  # skip header

            for row in r:
                aid = int(row[AID])

                offset = (aid - 1) * 28
                stored_hash = hashes[offset:offset + 28]
                if stored_hash != NULL_HASH:
                    continue

                parent_id = _int(row[index['parent_article_id']])
                if (parent_id == aid) or (parent_id in SKIP_PARENTS):
                    parent_id = None
                if parent_id:
                    poffset = (parent_id - 1) * 28
                    parent_hash = hashes[poffset:poffset + 28]
                    if parent_hash == NULL_HASH:
                        orphans += 1
                        continue
                    parent_hash = binascii.hexlify(parent_hash).decode("ascii")
                else:
                    parent_hash = None

                date = row[index['date']]
                date = date.split("+")[0]
                date = datetime.strptime(date, '%Y-%m-%d %H:%M:%S')

                a = Article(project_id=row[index['project_id']],
                            date=date,
                            title=row[index['headline']],
                            url=row[index['url']] or None,
                            text=row[index['text']],
                            parent_hash=parent_hash)

                a.properties = {
                    v: row[index[v]]
                    for v in PROP_FIELDS if row[index[v]]
                }
                a.properties['medium'] = media[int(row[index['medium_id']])]
                a.properties['uuid'] = str(a.properties['uuid'])
                props = json.dumps(a.properties)

                hash = amcates.get_article_dict(a)['hash']
                hashes[offset:offset + 28] = binascii.unhexlify(hash)

                yield (a.project_id, aid, a.date, a.title, a.url, a.text,
                       hash2binary(hash), hash2binary(a.parent_hash), props)
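
Both variants rely on the same lookup structure: one pre-allocated byte buffer with a 28-byte SHA-224 slot per article id, addressed as (aid - 1) * 28, where an all-zero slot means the hash is not known yet. The following stripped-down sketch shows just that indexing scheme; the names and the hashlib stand-in are illustrative assumptions and deliberately ignore amcates.get_article_dict, which computes the real hashes above.

# Minimal sketch of the fixed-width hash buffer used in both examples:
# one 28-byte SHA-224 slot per article id. Names and the hashlib stand-in
# are assumptions for illustration, not the project's API.
import ctypes
import hashlib

HASH_LEN = 28                   # SHA-224 digest size in bytes
NULL_HASH = b'\x00' * HASH_LEN  # marker for "hash not computed yet"

max_id = 1000
hashes = ctypes.create_string_buffer(max_id * HASH_LEN)

def store(aid, text):
    digest = hashlib.sha224(text.encode("utf-8")).digest()
    offset = (aid - 1) * HASH_LEN
    hashes[offset:offset + HASH_LEN] = digest
    return digest

def lookup(aid):
    offset = (aid - 1) * HASH_LEN
    h = hashes[offset:offset + HASH_LEN]
    return None if h == NULL_HASH else h

store(42, "example article text")
assert lookup(42) is not None
assert lookup(43) is None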