def run(input_filename, output_filename):
    """Re-cluster every article group of a CSV file and renumber the result.

    Rows of ``input_filename`` are decoded from UTF-8 into ``Article``
    records and batched by consecutive equal ``group`` values.  Each batch is
    handed to ``recluster``; the resulting groups are sorted largest first and
    written to ``output_filename`` with ``group`` replaced by a counter that
    starts at 0 and increases by one per written group.  Batches that
    recluster into more than one group are counted, reported on stdout and
    passed to ``split``.

    Args:
        input_filename: Path of the CSV file to read.  Only consecutive rows
            sharing a ``group`` value end up in the same batch, because
            ``itertools.groupby`` does not sort its input.
        output_filename: Path of the CSV file to (over)write.

    Raises:
        ValueError: If a ``group`` value cannot be converted with ``int``.
    """
    # Binary mode is what the Python 2 csv module expects, and the with-blocks
    # make sure both files are closed (and the output flushed) even when a
    # row fails to process.
    with open(input_filename, 'rb') as infile:
        with open(output_filename, 'wb') as outfile:
            reader = itertools.imap(
                lambda row: Article(*[f.decode('utf-8') for f in row]),
                csv.reader(infile))
            writer = csv.writer(outfile)

            group_counter, split_counter = 0, 0

            batches = itertools.groupby(reader, lambda a: a.group)
            for i, (group_id, articles) in enumerate(batches):
                group_id, articles = int(group_id), list(articles)
                if i % 1000 == 0 and i:
                    # Float arithmetic on purpose: int / int truncates under
                    # Python 2, which made the reported percentage useless.
                    # Also guard against no group having been written yet.
                    if group_counter:
                        percent = 100.0 * split_counter / group_counter
                    else:
                        percent = 0.0
                    print("%8i %8i %8i %8i %8.5f%%" % (
                        i, group_id, split_counter, group_counter, percent))

                groups = list(recluster(articles))

                # Largest group first; ties keep the order recluster produced.
                groups.sort(key=len, reverse=True)
                # NOTE(review): the second clause can never change the result
                # (groups[1:] is empty whenever len(groups) <= 1), so this is
                # equivalent to len(groups) > 1.  Confirm whether `and` was
                # intended before changing it.
                if len(groups) > 1 or sum(len(g) for g in groups[1:]) > 8:
                    split_counter += 1
                    print("%s %s" % (len(groups), sorted(map(len, groups))))
                    split(groups)

                for group in groups:
                    gc = unicode(group_counter)
                    for article in group:
                        article = article._replace(group=gc)
                        writer.writerow([f.encode('utf-8') for f in article])
                    group_counter += 1
def run(input_filename, output_filename):
    """Re-cluster every article group of a CSV file and renumber the result.

    Rows of ``input_filename`` are decoded from UTF-8 into ``Article``
    records and batched by consecutive equal ``group`` values.  Each batch is
    handed to ``recluster``; the resulting groups are sorted largest first and
    written to ``output_filename`` with ``group`` replaced by a counter that
    starts at 0 and increases by one per written group.  Batches that
    recluster into more than one group are counted, reported on stdout and
    passed to ``split``.

    Args:
        input_filename: Path of the CSV file to read.  Only consecutive rows
            sharing a ``group`` value end up in the same batch, because
            ``itertools.groupby`` does not sort its input.
        output_filename: Path of the CSV file to (over)write.

    Raises:
        ValueError: If a ``group`` value cannot be converted with ``int``.
    """
    # Binary mode is what the Python 2 csv module expects, and the with-blocks
    # make sure both files are closed (and the output flushed) even when a
    # row fails to process.
    with open(input_filename, 'rb') as infile:
        with open(output_filename, 'wb') as outfile:
            reader = itertools.imap(
                lambda row: Article(*[f.decode('utf-8') for f in row]),
                csv.reader(infile))
            writer = csv.writer(outfile)

            group_counter, split_counter = 0, 0

            batches = itertools.groupby(reader, lambda a: a.group)
            for i, (group_id, articles) in enumerate(batches):
                group_id, articles = int(group_id), list(articles)
                if i % 1000 == 0 and i:
                    # Float arithmetic on purpose: int / int truncates under
                    # Python 2, which made the reported percentage useless.
                    # Also guard against no group having been written yet.
                    if group_counter:
                        percent = 100.0 * split_counter / group_counter
                    else:
                        percent = 0.0
                    print("%8i %8i %8i %8i %8.5f%%" % (
                        i, group_id, split_counter, group_counter, percent))

                groups = list(recluster(articles))

                # Largest group first; ties keep the order recluster produced.
                groups.sort(key=len, reverse=True)
                # NOTE(review): the second clause can never change the result
                # (groups[1:] is empty whenever len(groups) <= 1), so this is
                # equivalent to len(groups) > 1.  Confirm whether `and` was
                # intended before changing it.
                if len(groups) > 1 or sum(len(g) for g in groups[1:]) > 8:
                    split_counter += 1
                    print("%s %s" % (len(groups), sorted(map(len, groups))))
                    split(groups)

                for group in groups:
                    gc = unicode(group_counter)
                    for article in group:
                        article = article._replace(group=gc)
                        writer.writerow([f.encode('utf-8') for f in article])
                    group_counter += 1
        # NOTE(review): fragment -- the enclosing loop and the `try:` that pairs
        # with the `except:` below are not visible in this part of the file.
        # Keep the (field, value) pairs of this article whose field name is in
        # IDENTIFIERS and whose value is truthy.
        identifiers = [(k,v) for k,v in article._asdict().items() if k in IDENTIFIERS and v]
        data = None # dict(identifiers)  -- not read anywhere in the visible lines
        if not identifiers:
            # Nothing to cluster on: count the article and move on.
            without_identifiers += 1
            continue
        # File the article under its first identifier
        # (presumably `articles` is a defaultdict(list) -- TODO confirm) ...
        articles[identifiers[0]].append(article)
        # ... then fold the lists of its other identifiers into that list and
        # bind those keys to it, so all of them resolve to one shared list.
        for identifier in identifiers[1:]:
            if articles[identifiers[0]] is not articles[identifier]:
                articles[identifiers[0]] += articles[identifier]
                # NOTE(review): only this key is re-pointed; any other key
                # still bound to the old list keeps it -- confirm this is
                # acceptable for transitive merges.
                articles[identifier] = articles[identifiers[0]]

        # Print i whenever it is a multiple of 10000.
        if i % 10000 == 0:
            print "%7d" % i
# NOTE(review): a bare except with `pass` silently discards every exception
# raised in the try body, including KeyboardInterrupt.
except:
    pass


# Re-cluster every merged cluster and write its articles out, numbering the
# resulting groups consecutively from 0.
i = 0
seen_clusters = set()
for cluster in articles.itervalues():
    # The merging code in this file binds one list object to several
    # identifier keys, so the same cluster can be yielded repeatedly; process
    # each list object only once to avoid writing its rows more than once.
    if id(cluster) in seen_clusters:
        continue
    seen_clusters.add(id(cluster))

    for group in recluster(cluster):
        for article in group:
            # _replace builds the same record as the _asdict()/Article(**...)
            # round trip, with only the group field changed.
            writer.writerow(article._replace(group=i))
        i += 1

    # NOTE(review): i can advance by several groups per cluster, so exact
    # multiples of 10000 may be skipped by this progress report.
    if i % 10000 == 0:
        print("%7d" % i)
        # NOTE(review): fragment -- the enclosing loop and the `try:` that pairs
        # with the `except:` below are not visible in this part of the file.
        # Keep the (field, value) pairs of this article whose field name is in
        # IDENTIFIERS and whose value is truthy.
        identifiers = [(k, v) for k, v in article._asdict().items()
                       if k in IDENTIFIERS and v]
        data = None  # dict(identifiers)  -- not read anywhere in the visible lines
        if not identifiers:
            # Nothing to cluster on: count the article and move on.
            without_identifiers += 1
            continue
        # File the article under its first identifier
        # (presumably `articles` is a defaultdict(list) -- TODO confirm) ...
        articles[identifiers[0]].append(article)
        # ... then fold the lists of its other identifiers into that list and
        # bind those keys to it, so all of them resolve to one shared list.
        for identifier in identifiers[1:]:
            if articles[identifiers[0]] is not articles[identifier]:
                articles[identifiers[0]] += articles[identifier]
                # NOTE(review): only this key is re-pointed; any other key
                # still bound to the old list keeps it -- confirm this is
                # acceptable for transitive merges.
                articles[identifier] = articles[identifiers[0]]

        # Print i whenever it is a multiple of 10000.
        if i % 10000 == 0:
            print "%7d" % i
# NOTE(review): a bare except with `pass` silently discards every exception
# raised in the try body, including KeyboardInterrupt.
except:
    pass

# Re-cluster every merged cluster and write its articles out, numbering the
# resulting groups consecutively from 0.
i = 0
seen_clusters = set()
for cluster in articles.itervalues():
    # The merging code in this file binds one list object to several
    # identifier keys, so the same cluster can be yielded repeatedly; process
    # each list object only once to avoid writing its rows more than once.
    if id(cluster) in seen_clusters:
        continue
    seen_clusters.add(id(cluster))

    for group in recluster(cluster):
        for article in group:
            # _replace builds the same record as the _asdict()/Article(**...)
            # round trip, with only the group field changed.
            writer.writerow(article._replace(group=i))
        i += 1

    # NOTE(review): i can advance by several groups per cluster, so exact
    # multiples of 10000 may be skipped by this progress report.
    if i % 10000 == 0:
        print("%7d" % i)