def run(input_filename, output_filename):
    """Group record ids from a gzipped tar archive by the identifiers they share.

    Every record yielded by ``get_records(tar, types=SPLIT_OUT)`` is reduced
    to a list of ``tidy_identifier`` results for those of its keys that are
    in ``IDENTIFIERS``; results whose second element is falsy are dropped.
    Records of type ``'Organization'`` additionally get a synthetic
    ``('org', hash((name, address)))`` identifier.  Records left with no
    identifiers are collected separately; all others are added to the set
    stored under their first identifier, and the sets stored under their
    remaining identifiers are merged into it.

    A progress line (record index, number of identifier keys, number of
    records without identifiers, largest merged set seen so far, and field 2
    of ``resource.getrusage`` i.e. ``ru_maxrss``) is printed for every
    10000th record that has at least one identifier.

    Args:
        input_filename: Path of the ``.tar.gz`` archive to read.
        output_filename: Not used in this part of the function.

    Any exception raised while scanning (including ``KeyboardInterrupt``,
    since ``BaseException`` is caught) is printed as a traceback and then
    suppressed.
    """
    tar = tarfile.open(input_filename, 'r:gz')
    # Not used in this part of the function; kept because creating the
    # (non-auto-deleted) temporary file is an observable side effect.
    record_file = tempfile.NamedTemporaryFile(delete=False)

    articles, without_identifiers = defaultdict(set), set()
    biggest = 0

    try:
        for i, (_, record) in enumerate(get_records(tar, types=SPLIT_OUT)):
            tidied = [tidy_identifier(k, record[k])
                      for k in record if k in IDENTIFIERS]
            # A real list (not a lazy ``filter`` object) so that the
            # ``append``, indexing and slicing below work on Python 3 too.
            identifiers = [pair for pair in tidied if pair[1]]
            if record['type'] == 'Organization':
                identifiers.append(
                    ('org', hash((record.get('name'), record.get('address')))))
            if not identifiers:
                without_identifiers.add(record['id'])
                continue

            primary = articles[identifiers[0]]
            primary.add(record['id'])
            for identifier in identifiers[1:]:
                other = articles[identifier]
                if other is primary:
                    continue
                # NOTE(review): only ``identifier`` itself is re-pointed at
                # the merged set.  Any other key that already shared
                # ``other`` keeps referencing the absorbed set, so the
                # grouping may be incomplete -- confirm whether this matters
                # for the later grouping step.
                primary |= other
                articles[identifier] = primary
                # ``biggest`` is only refreshed when a merge happens.
                if len(primary) > biggest:
                    biggest = len(primary)

            if i % 10000 == 0:
                # Single parenthesised argument: prints identically under
                # the Python 2 print statement and the Python 3 function.
                print("%7d %7d %7d %7d %10d" % (
                    i, len(articles), len(without_identifiers), biggest,
                    resource.getrusage(resource.RUSAGE_SELF)[2]))
                # Discard the member list the TarFile has accumulated,
                # presumably to keep memory bounded on large archives.
                tar.members = []
    except BaseException:
        # Deliberately broad: report the failure and fall through instead
        # of aborting.  NOTE(review): this also swallows KeyboardInterrupt
        # and SystemExit -- confirm that is intended.
        traceback.print_exc()
def run(input_filename, output_filename):
    tar = tarfile.open(input_filename, 'r:gz')
    record_file = tempfile.NamedTemporaryFile(delete=False)

    articles, without_identifiers = defaultdict(set), set()
    biggest = 0

    try:
        for i, (_, record) in enumerate(get_records(tar, types=SPLIT_OUT)):
            identifiers = [
                tidy_identifier(k, record[k]) for k in record
                if k in IDENTIFIERS
            ]
            identifiers = filter(lambda x: x[1], identifiers)
            if record['type'] == 'Organization':
                identifiers.append(
                    ('org', hash((record.get('name'), record.get('address')))))
            if not identifiers:
                without_identifiers.add(record['id'])
                continue
            articles[identifiers[0]].add(record['id'])
            for identifier in identifiers[1:]:
                if articles[identifiers[0]] is not articles[identifier]:
                    articles[identifiers[0]] |= articles[identifier]
                    articles[identifier] = articles[identifiers[0]]
                    if len(articles[identifier]) > biggest:
                        biggest = len(articles[identifier])

            if i % 10000 == 0:
                print "%7d %7d %7d %7d %10d" % (
                    i, len(articles), len(without_identifiers), biggest,
                    resource.getrusage(resource.RUSAGE_SELF)[2])
                tar.members = []
    except BaseException, e:
        traceback.print_exc()
    articles = dict((id(l), l) for l in articles.values()).values()
    groups = {}
    for i, article_list in enumerate(articles):
        for article_id in article_list:
            groups[article_id] = i
    for article in without_identifiers:
        i += 1
        groups[article] = i

    del without_identifiers, articles

    RELATIONS = 'author editor translator'.split()

    try:
        for i, (index, record) in enumerate(get_records(tar, True, types=SPLIT_OUT)):
            #pprint.pprint(index)
            records, group = [], groups[record['id']]
            to_add, queue, data = set(), set([record['id']]), {'group': group, 'records': records}

            while queue:
                id_ = queue.pop()
                record = index[id_]
                to_add.add(id_)
                for id_ in itertools.chain(*[(lambda x:(x if isinstance(x, list) else [x]))(record.get(k, [])) for k in RELATIONS]):
                    #print id_
                    id_ = id_['ref'][1:]
                    if id_ not in to_add:
                        if id_ in index and index[id_]['type'] not in SPLIT_OUT:
                            queue.add(id_)
    articles = dict((id(l), l) for l in articles.values()).values()
    groups = {}
    for i, article_list in enumerate(articles):
        for article_id in article_list:
            groups[article_id] = i
    for article in without_identifiers:
        i += 1
        groups[article] = i

    del without_identifiers, articles

    RELATIONS = 'author editor translator'.split()

    try:
        for i, (index,
                record) in enumerate(get_records(tar, True, types=SPLIT_OUT)):
            #pprint.pprint(index)
            records, group = [], groups[record['id']]
            to_add, queue, data = set(), set([record['id']]), {
                'group': group,
                'records': records
            }

            while queue:
                id_ = queue.pop()
                record = index[id_]
                to_add.add(id_)
                for id_ in itertools.chain(
                        *[(lambda x: (x if isinstance(x, list) else [x])
                           )(record.get(k, [])) for k in RELATIONS]):
                    #print id_