Beispiel #1
0
def process_file(filename, outdir):
    """Convert the citations read from *filename* into three output files.

    Every ``(key, ref, val)`` triple produced by ``read_citations`` is added
    to a fresh ``CitationSet`` under ``key``; the stored text is
    ``format_ref(ref)`` followed by ``untexify(decode_utf(val))``.

    The set is then written into *outdir* as ``<stem>_map.txt``,
    ``<stem>.xml`` and ``<stem>.txt``, where ``<stem>`` is the base name of
    *filename* with its extension removed.
    """
    citations = CitationSet()
    for entry in read_citations(filename):
        key, ref, val = entry
        prefix = format_ref(ref)
        citations.add(key, prefix + untexify(decode_utf(val)))

    # Output files are named after the input file, minus its extension.
    base_name = os.path.basename(filename)
    stem, _extension = os.path.splitext(base_name)

    map_path = os.path.join(outdir, '%s_map.txt' % stem)
    nlm_path = os.path.join(outdir, '%s.xml' % stem)
    txt_path = os.path.join(outdir, '%s.txt' % stem)

    citations.write_mapfile(map_path)
    citations.write_nlm(nlm_path)
    citations.write_txt(txt_path)
def process_file(filename, outdir):
    """Read citations from *filename* and write them out under *outdir*.

    Each ``(key, ref, val)`` triple from ``read_citations`` is stored in a
    new ``CitationSet`` as ``format_ref(ref) + untexify(decode_utf(val))``.
    The set is then written as ``<name>_map.txt``, ``<name>.xml`` and
    ``<name>.txt``, where ``<name>`` is the base name of *filename* without
    its extension.

    NOTE(review): this definition rebinds ``process_file``; an earlier
    definition with the same name and the same first nine lines exists in
    this file. Confirm which one is meant to survive.
    """
    citations = CitationSet()
    for (key, ref, val) in read_citations(filename):
        citations.add(key, format_ref(ref) + untexify(decode_utf(val)))

    # Output files are named after the input file, minus its extension.
    testId = os.path.splitext(os.path.basename(filename))[0]
    citations.write_mapfile(os.path.join(outdir, '%s_map.txt' % testId))
    citations.write_nlm(os.path.join(outdir, '%s.xml' % testId))
    citations.write_txt(os.path.join(outdir, '%s.txt' % testId))
    # NOTE(review): ``text`` is neither a parameter nor a local of this
    # function, so unless a module-level ``text`` exists this line raises
    # NameError once the three files have been written. It looks like the
    # body of a separate parser function whose ``def`` line is missing --
    # confirm against the original file before relying on this return value.
    return parse(text, r'(?P<authors>.+?), \{\\em (?P<source>.+?)\}(?:, pages (?P<fpage>.+?)--(?P<lpage>.+?))? \((?P<year>.+?)\)\. (?P<title>.+)', {'authors': authors_parser_comma})

def get_mixed_citations(filename, parsing_fun):
    """Yield ``(no, text)`` pairs for the citations read from *filename*.

    For every ``(no, ref, val)`` triple produced by ``read_citations``, the
    yielded text is ``format_ref(ref)`` concatenated with ``untexify``
    applied to ``parsing_fun(decode_utf(val))``.
    """
    for entry in read_citations(filename):
        no, ref, val = entry
        prefix = format_ref(ref)
        parsed = parsing_fun(decode_utf(val))
        yield (no, prefix + untexify(parsed))

def limit(generator, no):
    """Yield at most *no* items from *generator*.

    Parameters:
        generator: any iterable or iterator; it is consumed lazily and never
            advanced past the last item that is yielded.
        no: maximum number of items to yield; zero or a negative value
            yields nothing.

    Iteration stops quietly if *generator* runs out before *no* items have
    been produced.
    """
    # The built-in next() works on Python 2.6+ and Python 3, whereas the
    # ``.next()`` method exists only on Python 2 iterators.
    iterator = iter(generator)
    for _ in range(no):
        try:
            item = next(iterator)
        except StopIteration:
            # A StopIteration escaping a generator body becomes RuntimeError
            # under PEP 479, so end the generator explicitly instead.
            return
        yield item

# Directory containing the input .bbl files. It is concatenated directly with
# each file name below, so the trailing separator is part of the string.
indir = r'C:\Users\matfed\Desktop\bbls\\'

# Each entry pairs a .bbl file name with the parser function applied to it.
files = [
    (r'abbrv.bbl', parse_abbrv),
    (r'acm.bbl', parse_acm),
    (r'alpha.bbl', parse_alpha),
    (r'apalike.bbl', parse_apalike),
    (r'ieeetr.bbl', parse_ieeetr),
    (r'jpc.bbl', parse_jpc),
    (r'pccp.bbl', parse_pccp),
    (r'plain.bbl', parse_plain),
    (r'ppcf.bbl', parse_ppcf),
    (r'revcompchem.bbl', parse_revcompchem),
]

# NOTE(review): this name shadows the built-in ``set`` for the rest of the
# module; it is kept because other code may refer to it.
set = CitationSet()
for i, (filename, parsing_fun) in enumerate(files, 1):
    # Take up to 100 citations per file. Keys are shifted by 10000 times the
    # 1-based file index -- presumably so keys from different files do not
    # collide; confirm.
    for k, v in limit(
            get_mixed_citations(indir + filename, parsing_fun), 100):
        set.add(str(i * 10000 + int(k)), v)
#set.write_nlm(r'C:\Users\matfed\Desktop\training.xml')
set.write_txt(r'C:\Users\matfed\Desktop\parsertest.txt')
set.write_mapfile(r'C:\Users\matfed\Desktop\parsertest_map.txt')
        text,
        r'(?P<authors>.+?), \{\\em (?P<source>.+?)\}(?:, pages (?P<fpage>.+?)--(?P<lpage>.+?))? \((?P<year>.+?)\)\. (?P<title>.+)',
        {'authors': authors_parser_comma})


def get_mixed_citations(filename, parsing_fun):
    """Generate ``(no, text)`` tuples from the citations in *filename*.

    ``read_citations`` supplies ``(no, ref, val)`` triples. Each yielded
    text is the formatted reference (``format_ref(ref)``) followed by the
    value after it has been passed through ``decode_utf``, then
    *parsing_fun*, then ``untexify``.
    """
    for no, ref, val in read_citations(filename):
        head = format_ref(ref)
        body = untexify(parsing_fun(decode_utf(val)))
        yield no, head + body


def limit(generator, no):
    """Yield at most *no* items from *generator*.

    Parameters:
        generator: any iterable or iterator; it is consumed lazily and never
            advanced past the last item that is yielded.
        no: maximum number of items to yield; zero or a negative value
            yields nothing.

    Iteration stops quietly if *generator* runs out before *no* items have
    been produced.
    """
    # The built-in next() works on Python 2.6+ and Python 3, whereas the
    # ``.next()`` method exists only on Python 2 iterators.
    iterator = iter(generator)
    for _ in range(no):
        try:
            item = next(iterator)
        except StopIteration:
            # A StopIteration escaping a generator body becomes RuntimeError
            # under PEP 479, so end the generator explicitly instead.
            return
        yield item


# Input directory for the .bbl files; file names are appended to it with plain
# string concatenation, so it carries its own trailing separator.
indir = r'C:\Users\matfed\Desktop\bbls\\'

# (file name, parser function) pairs processed in this order.
files = [
    (r'abbrv.bbl', parse_abbrv),
    (r'acm.bbl', parse_acm),
    (r'alpha.bbl', parse_alpha),
    (r'apalike.bbl', parse_apalike),
    (r'ieeetr.bbl', parse_ieeetr),
    (r'jpc.bbl', parse_jpc),
    (r'pccp.bbl', parse_pccp),
    (r'plain.bbl', parse_plain),
    (r'ppcf.bbl', parse_ppcf),
    (r'revcompchem.bbl', parse_revcompchem),
]

# NOTE(review): ``set`` hides the built-in of the same name from here on; the
# name is left unchanged because other code may depend on it.
set = CitationSet()
for i, (filename, parsing_fun) in enumerate(files, 1):
    # Up to 100 citations are taken from each file and stored under the
    # original numeric key plus 10000 times the 1-based file index.
    for k, v in limit(
            get_mixed_citations(indir + filename, parsing_fun), 100):
        set.add(str(i * 10000 + int(k)), v)
#set.write_nlm(r'C:\Users\matfed\Desktop\training.xml')
set.write_txt(r'C:\Users\matfed\Desktop\parsertest.txt')
set.write_mapfile(r'C:\Users\matfed\Desktop\parsertest_map.txt')