def doc_path_to_dict(path):
    """Parse the XML document at *path* and convert it with ``process_doc``.

    The path is split into its directory and file name, which are handed to
    ``XMLCorpusReader``; the parsed XML returned by ``reader.xml()`` is then
    passed to ``process_doc``.

    Returns whatever ``process_doc`` returns.  If ``process_doc`` raises
    ``ValueError``, the exception's first argument is returned instead of
    letting the exception propagate, so callers must be prepared for either
    kind of value.
    """
    directory, fname = os.path.split(path)
    reader = XMLCorpusReader(directory, fname)
    doc = reader.xml()
    try:
        return process_doc(doc)
    except ValueError as e:
        # The ``as`` form works on Python 2.6+ and 3.x; the older
        # ``except ValueError, e`` spelling is a SyntaxError on Python 3.
        return e.args[0]
'paul krugman', 'maureen dowd', 'frank rich', 'verlyn klinkenborg', 'adam cohen', 'lawrence downes' ] roottest = './nyt_corpus/data/2005/**/**/' nottestmode = False authord = defaultdict(list) icount = 0 ncount = 0 acount = 0 for filename in texts: reader = XMLCorpusReader(os.path.dirname(filename), os.path.basename(filename)) xml = reader.xml() ptext = "" desk = "" body = xml.find('body') head = xml.find('head') auth = body.find('body.head').find('byline') for d in head: if d.get("name") == "dsk": desk = d.get("content") if desk == "Editorial Desk": icount += 1 try: if auth is not None: auth = auth.text if auth is not None: acount += 1