def run(input_filename, output_filename):
    """Re-cluster every article group of a CSV file and renumber the groups.

    Each row of ``input_filename`` is decoded from UTF-8 and turned into an
    ``Article``. Consecutive rows sharing the same ``group`` value are passed
    to ``recluster()``; every cluster it yields is written to
    ``output_filename`` (largest cluster first) with ``group`` replaced by a
    fresh sequential id starting at 0.

    A progress line is printed every 1000 input groups, and the cluster
    sizes are printed whenever an input group comes back as more than one
    cluster.

    Args:
        input_filename: Path of the CSV file to read. Rows of one group must
            be adjacent, because grouping relies on ``itertools.groupby``.
        output_filename: Path of the CSV file to write; it is overwritten.

    Raises:
        ValueError: If a row's ``group`` field cannot be parsed by ``int()``.
    """
    # The Python 2 csv module expects binary-mode files. The context manager
    # guarantees the output is flushed and both files are closed, even if
    # reclustering raises part-way through.
    with open(input_filename, 'rb') as in_file, \
            open(output_filename, 'wb') as out_file:
        reader = (Article(*[f.decode('utf-8') for f in row])
                  for row in csv.reader(in_file))
        writer = csv.writer(out_file)

        group_counter, split_counter = 0, 0

        grouped = itertools.groupby(reader, lambda a: a.group)
        for i, (group_id, articles) in enumerate(grouped):
            group_id, articles = int(group_id), list(articles)

            if i % 1000 == 0 and i:
                # Force float arithmetic: int / int truncates in Python 2,
                # which made the reported percentage always come out as 0.
                if group_counter:
                    percent = 100.0 * split_counter / group_counter
                else:
                    percent = 0.0
                print("%8i %8i %8i %8i %8.5f%%" % (
                    i, group_id, split_counter, group_counter, percent))

            # Largest cluster first; the sort is stable, so equally sized
            # clusters keep the order recluster() produced them in.
            groups = sorted(recluster(articles), key=len, reverse=True)

            # NOTE(review): the second clause can never be true when the
            # first is false (groups[1:] is empty then), so this is
            # equivalent to ``len(groups) > 1``. Possibly ``and`` was meant;
            # behaviour is kept as-is - confirm the intent.
            if len(groups) > 1 or sum(len(g) for g in groups[1:]) > 8:
                split_counter += 1
                print("%s %s" % (len(groups), sorted(map(len, groups))))
                # split() is defined elsewhere in the project; its effect
                # is not visible from this function.
                split(groups)

            for group in groups:
                gc = unicode(group_counter)
                for article in group:
                    article = article._replace(group=gc)
                    writer.writerow([f.encode('utf-8') for f in article])
                group_counter += 1
def run(input_filename, output_filename):
    """Re-cluster every article group of a CSV file and renumber the groups.

    Each row of ``input_filename`` is decoded from UTF-8 and turned into an
    ``Article``. Consecutive rows sharing the same ``group`` value are passed
    to ``recluster()``; every cluster it yields is written to
    ``output_filename`` (largest cluster first) with ``group`` replaced by a
    fresh sequential id starting at 0.

    A progress line is printed every 1000 input groups, and the cluster
    sizes are printed whenever an input group comes back as more than one
    cluster.

    Args:
        input_filename: Path of the CSV file to read. Rows of one group must
            be adjacent, because grouping relies on ``itertools.groupby``.
        output_filename: Path of the CSV file to write; it is overwritten.

    Raises:
        ValueError: If a row's ``group`` field cannot be parsed by ``int()``.
    """
    # The Python 2 csv module expects binary-mode files. The context manager
    # guarantees the output is flushed and both files are closed, even if
    # reclustering raises part-way through.
    with open(input_filename, 'rb') as in_file, \
            open(output_filename, 'wb') as out_file:
        reader = (Article(*[f.decode('utf-8') for f in row])
                  for row in csv.reader(in_file))
        writer = csv.writer(out_file)

        group_counter, split_counter = 0, 0

        grouped = itertools.groupby(reader, lambda a: a.group)
        for i, (group_id, articles) in enumerate(grouped):
            group_id, articles = int(group_id), list(articles)

            if i % 1000 == 0 and i:
                # Force float arithmetic: int / int truncates in Python 2,
                # which made the reported percentage always come out as 0.
                if group_counter:
                    percent = 100.0 * split_counter / group_counter
                else:
                    percent = 0.0
                print("%8i %8i %8i %8i %8.5f%%" % (
                    i, group_id, split_counter, group_counter, percent))

            # Largest cluster first; the sort is stable, so equally sized
            # clusters keep the order recluster() produced them in.
            groups = sorted(recluster(articles), key=len, reverse=True)

            # NOTE(review): the second clause can never be true when the
            # first is false (groups[1:] is empty then), so this is
            # equivalent to ``len(groups) > 1``. Possibly ``and`` was meant;
            # behaviour is kept as-is - confirm the intent.
            if len(groups) > 1 or sum(len(g) for g in groups[1:]) > 8:
                split_counter += 1
                print("%s %s" % (len(groups), sorted(map(len, groups))))
                # split() is defined elsewhere in the project; its effect
                # is not visible from this function.
                split(groups)

            for group in groups:
                gc = unicode(group_counter)
                for article in group:
                    article = article._replace(group=gc)
                    writer.writerow([f.encode('utf-8') for f in article])
                group_counter += 1
identifiers = [(k,v) for k,v in article._asdict().items() if k in IDENTIFIERS and v] data = None # dict(identifiers) if not identifiers: without_identifiers += 1 continue articles[identifiers[0]].append(article) for identifier in identifiers[1:]: if articles[identifiers[0]] is not articles[identifier]: articles[identifiers[0]] += articles[identifier] articles[identifier] = articles[identifiers[0]] if i % 10000 == 0: print "%7d" % i except: pass i = 0 for group in articles.itervalues(): groups = recluster(group) for group in groups: for article in group: article = article._asdict() article['group'] = i article = Article(**article) writer.writerow(article) i += 1 if i % 10000 == 0: print "%7d" % i
identifiers = [(k, v) for k, v in article._asdict().items() if k in IDENTIFIERS and v] data = None # dict(identifiers) if not identifiers: without_identifiers += 1 continue articles[identifiers[0]].append(article) for identifier in identifiers[1:]: if articles[identifiers[0]] is not articles[identifier]: articles[identifiers[0]] += articles[identifier] articles[identifier] = articles[identifiers[0]] if i % 10000 == 0: print "%7d" % i except: pass i = 0 for group in articles.itervalues(): groups = recluster(group) for group in groups: for article in group: article = article._asdict() article['group'] = i article = Article(**article) writer.writerow(article) i += 1 if i % 10000 == 0: print "%7d" % i