Esempio n. 1
0
def splitByYear(fname, outpath):
    '''Split a .xml file into several .txt files by years.

    The XML file is streamed with ``ET.iterparse`` and every finished
    element is dispatched on a substring of its tag:

    * ``created``    -- the first four characters of its text (the ``year``)
      select the current output file ``<outpath>/all_<year>.txt``.
    * ``text``       -- ignored (nothing is written for it).
    * ``categories`` / ``acm-class`` -- the element text followed by a tab
      is passed to ``ioFile.dataToFile`` together with the current output
      file.  An element without text contributes a bare tab.

    fname   -- path of the XML file to read.
    outpath -- directory prefix used to build the per-year output paths.

    Returns None.  Prints a running count after every 10000 ``created``
    elements and the elapsed wall-clock time at the end.
    Raises TypeError if a ``created`` element has no text.
    '''
    start = time()
    context = ET.iterparse(fname, events=('end', ))
    # The output path is tracked separately from ``fname``.  Reusing the
    # parameter meant that a ``categories``/``acm-class`` element seen before
    # the first ``created`` element was handed to the writer with the *input*
    # XML path as its target.
    out_fname = None
    n_created = 0

    for _event, elem in context:
        tag = elem.tag
        if 'created' in tag:
            year = elem.text[:4]
            out_fname = outpath + '/all_' + year + '.txt'
            n_created += 1
            if n_created % 10000 == 0:
                print(n_created)
        elif 'text' in tag:
            # Nothing is written for these elements; the branch is kept so
            # that such tags never fall through to the branches below.
            pass
        elif 'categories' in tag or 'acm-class' in tag:
            # Skip anything that arrives before an output file is known.
            if out_fname is not None:
                # ``elem.text`` is None for an empty element; both tags now
                # degrade to a bare tab instead of only ``acm-class``.
                ioFile.dataToFile((elem.text or '') + '\t', out_fname)

        # clear elements to empty memory: drop already-processed earlier
        # siblings from the parent, then empty the element itself
        while elem.getprevious() is not None:
            del elem.getparent()[0]
        elem.clear()

    del context

    end = time()
    # Single-argument form prints the same line under Python 2 and 3.
    print("use time: %s" % (end - start))
Esempio n. 2
0
def splitByYear(fname, outpath):
    '''Split a .xml file into several .txt files by years.

    The XML file is streamed with ``ET.iterparse`` and every finished
    element is dispatched on a substring of its tag:

    * ``created``    -- the first four characters of its text (the ``year``)
      select the current output file ``<outpath>/all_<year>.txt``.
    * ``text``       -- ignored (nothing is written for it).
    * ``categories`` / ``acm-class`` -- the element text followed by a tab
      is passed to ``ioFile.dataToFile`` together with the current output
      file.  An element without text contributes a bare tab.

    fname   -- path of the XML file to read.
    outpath -- directory prefix used to build the per-year output paths.

    Returns None.  Prints a running count after every 10000 ``created``
    elements and the elapsed wall-clock time at the end.
    Raises TypeError if a ``created`` element has no text.
    '''
    start = time()
    context = ET.iterparse(fname, events=('end',))
    # Keep the output target in its own variable: overwriting ``fname`` let
    # a ``categories``/``acm-class`` element that precedes the first
    # ``created`` element be written to the *input* XML path.
    target = None
    seen = 0

    for _event, elem in context:
        tag = elem.tag
        if 'created' in tag:
            year = elem.text[:4]
            target = outpath + '/all_' + year + '.txt'
            seen += 1
            if seen % 10000 == 0:
                print(seen)
        elif 'text' in tag:
            # Nothing is written for these elements; the branch only stops
            # such tags from reaching the branches below.
            pass
        elif 'categories' in tag or 'acm-class' in tag:
            # No output file is known until a ``created`` element was seen.
            if target is not None:
                # Empty elements have ``text`` None; emit just the tab for
                # both tags rather than failing on ``categories``.
                ioFile.dataToFile((elem.text or '') + '\t', target)

        # clear elements to empty memory: remove earlier siblings that were
        # already handled, then empty the element itself
        while elem.getprevious() is not None:
            del elem.getparent()[0]
        elem.clear()

    del context

    end = time()
    # Single-argument form prints the same line under Python 2 and 3.
    print("use time: %s" % (end - start))
                        clf = re.sub(r"[0-9]{2}[.][0-9]{2}[+][EN]", '', clf)
                        clf = clf.replace('X', '')
                        f = re.findall(r"([A-Z])[ ,.]{0,1}([0-9M]{0,1})[ ,.]{0,1}([0-9]{0,2}[M]{0,1})", clf)
                        for level in f:
                            i = 3
                            # if a class is not in the dictionary, try its parent class
                            while i > 0:
                                c = '.'.join([l for l in level[:i] if l is not None and l != ''])
                                if len(c) == 1: c += '.'                               
                                if acm_class_dict.keys().count(c) > 0:
                                    clfs.append(c)
                                    break
                                i -= 1
                            '''
                            if i != 3:
                                print i, '|', tmp, '|', c
                            '''
                    clf = ' '.join(clfs)
        # the same document with different classifications
        if clf_dict.keys().count(line[2]) > 0:
            if clf != '':
                clf_dict[line[2]] = ' '.join(set([clf_dict[line[2]], clf]))
        else:
            clf_dict[line[2]] = clf


    # keep the order of text is the same as the order of posterior matrix
    data_iterator = inFile
    for line in data_iterator:
        ioFile.dataToFile(clf_dict[line]+'\n', outFile)