# NOTE(review): splitByYear is defined twice in this file with the same logic;
# the later definition is the one bound at import time. Consider removing one.
def splitByYear(fname, outpath):
    '''Split a .xml file into several .txt files by years.

    Streams through ``fname`` with ``ET.iterparse`` ('end' events only).
    Every element whose tag contains "created" selects the current output
    file, ``outpath + '/all_' + <first 4 chars of its text> + '.txt'``.
    The text of each following element whose tag contains "categories" or
    "acm-class" is handed to ``ioFile.dataToFile`` for that file, followed
    by a tab; an element without text contributes only the tab.

    Fields met before any "created" element, or after a "created" element
    that has no text, have no known year and are skipped.

    Args:
        fname: path of the XML file to read. It is never used as a write
            target.
        outpath: prefix of the output files (joined with '/all_<year>.txt').

    Returns:
        None. Prints a counter every 10000 "created" elements and the
        elapsed time at the end.
    '''
    start = time()
    context = ET.iterparse(fname, events=('end',))
    # Output path is tracked separately from the input path: reusing `fname`
    # made fields seen before the first date get written to the input file.
    out_fname = None
    n_created = 0
    for _event, elem in context:
        tag = elem.tag
        if 'created' in tag:
            # get date
            date = elem.text
            if date is not None:
                out_fname = outpath + '/all_' + date[:4] + '.txt'
            else:
                # No date text: the year is unknown, so do not file the
                # fields of this record under the previous record's year.
                out_fname = None
            n_created += 1
            if n_created % 10000 == 0:
                print(n_created)
        elif 'text' in tag:
            # Nothing is written for these elements; the branch is kept so
            # that such tags are still excluded from the checks below.
            pass
        elif 'categories' in tag or 'acm-class' in tag:
            if out_fname is not None:
                # Empty elements have text None; still emit the tab.
                ioFile.dataToFile((elem.text or '') + '\t', out_fname)
        # clear elements to empty memory: drop already-processed preceding
        # siblings from the parent, then empty the element itself
        while elem.getprevious() is not None:
            del elem.getparent()[0]
        elem.clear()
    del context
    end = time()
    print('use time: %s' % (end - start))
# NOTE(review): splitByYear is also defined earlier in this file with the
# same logic; this later definition is the one bound at import time.
# Consider removing the duplicate.
def splitByYear(fname, outpath):
    '''Split a .xml file into several .txt files by years.

    Streams through ``fname`` with ``ET.iterparse`` ('end' events only).
    Every element whose tag contains "created" selects the current output
    file, ``outpath + '/all_' + <first 4 chars of its text> + '.txt'``.
    The text of each following element whose tag contains "categories" or
    "acm-class" is handed to ``ioFile.dataToFile`` for that file, followed
    by a tab; an element without text contributes only the tab.

    Fields met before any "created" element, or after a "created" element
    that has no text, have no known year and are skipped.

    Args:
        fname: path of the XML file to read. It is never used as a write
            target.
        outpath: prefix of the output files (joined with '/all_<year>.txt').

    Returns:
        None. Prints a counter every 10000 "created" elements and the
        elapsed time at the end.
    '''
    start = time()
    context = ET.iterparse(fname, events=('end',))
    # Output path is tracked separately from the input path: reusing `fname`
    # made fields seen before the first date get written to the input file.
    out_fname = None
    n_created = 0
    for _event, elem in context:
        tag = elem.tag
        if 'created' in tag:
            # get date
            date = elem.text
            if date is not None:
                out_fname = outpath + '/all_' + date[:4] + '.txt'
            else:
                # No date text: the year is unknown, so do not file the
                # fields of this record under the previous record's year.
                out_fname = None
            n_created += 1
            if n_created % 10000 == 0:
                print(n_created)
        elif 'text' in tag:
            # Nothing is written for these elements; the branch is kept so
            # that such tags are still excluded from the checks below.
            pass
        elif 'categories' in tag or 'acm-class' in tag:
            if out_fname is not None:
                # Empty elements have text None; still emit the tab.
                ioFile.dataToFile((elem.text or '') + '\t', out_fname)
        # clear elements to empty memory: drop already-processed preceding
        # siblings from the parent, then empty the element itself
        while elem.getprevious() is not None:
            del elem.getparent()[0]
        elem.clear()
    del context
    end = time()
    print('use time: %s' % (end - start))
clf = re.sub(r"[0-9]{2}[.][0-9]{2}[+][EN]", '', clf) clf = clf.replace('X', '') f = re.findall(r"([A-Z])[ ,.]{0,1}([0-9M]{0,1})[ ,.]{0,1}([0-9]{0,2}[M]{0,1})", clf) for level in f: i = 3 # if a class is not in the dictionary, try its parent class while i > 0: c = '.'.join([l for l in level[:i] if l is not None and l != '']) if len(c) == 1: c += '.' if acm_class_dict.keys().count(c) > 0: clfs.append(c) break i -= 1 ''' if i != 3: print i, '|', tmp, '|', c ''' clf = ' '.join(clfs) # the same document with different classifications if clf_dict.keys().count(line[2]) > 0: if clf != '': clf_dict[line[2]] = ' '.join(set([clf_dict[line[2]], clf])) else: clf_dict[line[2]] = clf # keep the order of text is the same as the order of posterior matrix data_iterator = inFile for line in data_iterator: ioFile.dataToFile(clf_dict[line]+'\n', outFile)