コード例 #1
0
ファイル: select_nyt.py プロジェクト: fhenriquefv/Causeway
def doc_path_to_dict(path):
    """Parse the XML document at *path* and run it through ``process_doc``.

    The path is split into its directory and file name, which are handed to
    ``XMLCorpusReader``; the parsed XML is then passed to ``process_doc``.

    Args:
        path: Filesystem path of the XML document to load.

    Returns:
        Whatever ``process_doc`` returns for the parsed document. If
        ``process_doc`` raises ``ValueError``, the first argument of that
        exception is returned instead of propagating the error.
    """
    directory, fname = os.path.split(path)
    reader = XMLCorpusReader(directory, fname)
    doc = reader.xml()
    try:
        return process_doc(doc)
    # ``except ValueError, e`` is Python 2-only syntax; the ``as`` form is
    # accepted by Python 2.6+ and Python 3 alike.
    except ValueError as e:
        # Callers get the exception's first argument as the result.
        return e.args[0]
コード例 #2
0
ファイル: nycorp.py プロジェクト: MaiaHamin/juniorwork
    'paul krugman', 'maureen dowd', 'frank rich', 'verlyn klinkenborg',
    'adam cohen', 'lawrence downes'
]
# Path pattern (contains ``**`` wildcards) under the 2005 corpus data
# directory. NOTE(review): where this is consumed is not visible in this
# excerpt -- presumably a glob call; confirm.
roottest = './nyt_corpus/data/2005/**/**/'

# Boolean switch, off by default. NOTE(review): its reader is not visible
# here; confirm what it toggles.
nottestmode = False

# Mapping whose missing keys start out as an empty list.
# Presumably keyed by author name -- verify against the code that fills it.
authord = defaultdict(list)

# Running tallies updated while iterating over the corpus files:
#   icount -- incremented for each file whose desk is "Editorial Desk".
#   ncount -- not updated in the visible part of the loop; TODO confirm use.
#   acount -- incremented for Editorial Desk files whose byline has text.
icount = 0
ncount = 0
acount = 0
for filename in texts:
    reader = XMLCorpusReader(os.path.dirname(filename),
                             os.path.basename(filename))
    xml = reader.xml()
    ptext = ""
    desk = ""
    body = xml.find('body')
    head = xml.find('head')
    auth = body.find('body.head').find('byline')
    for d in head:
        if d.get("name") == "dsk":
            desk = d.get("content")
    if desk == "Editorial Desk":
        icount += 1
        try:
            if auth is not None:
                auth = auth.text
                if auth is not None:
                    acount += 1