def process_file(filename, outdir):
    """Convert one citation file into map, NLM and plain-text outputs.

    Each ``(key, ref, val)`` triple produced by ``read_citations(filename)``
    is added to a fresh ``CitationSet`` under ``key``.  The stored text is
    ``format_ref(ref)`` followed by ``untexify(decode_utf(val))``.

    The set is then written into ``outdir`` three times.  The file stem is
    the base name of ``filename`` with its extension removed:

    * ``<stem>_map.txt`` through ``write_mapfile``
    * ``<stem>.xml`` through ``write_nlm``
    * ``<stem>.txt`` through ``write_txt``
    """
    collected = CitationSet()
    for entry in read_citations(filename):
        key, ref, val = entry
        prefix = format_ref(ref)
        text = untexify(decode_utf(val))
        collected.add(key, prefix + text)

    base_name = os.path.basename(filename)
    stem = os.path.splitext(base_name)[0]

    def target(template):
        # Fill the stem into the file-name template and place it in outdir.
        return os.path.join(outdir, template % stem)

    collected.write_mapfile(target('%s_map.txt'))
    collected.write_nlm(target('%s.xml'))
    collected.write_txt(target('%s.txt'))
return parse(text, r'(?P<authors>.+?), \{\\em (?P<source>.+?)\}(?:, pages (?P<fpage>.+?)--(?P<lpage>.+?))? \((?P<year>.+?)\)\. (?P<title>.+)', {'authors': authors_parser_comma}) def get_mixed_citations(filename, parsing_fun): for (no,ref,val) in read_citations(filename): yield (no, format_ref(ref) + untexify(parsing_fun(decode_utf(val)))) def limit(generator, no): for i in range(no): yield generator.next() indir = r'C:\Users\matfed\Desktop\bbls\\' files = [(r'abbrv.bbl', parse_abbrv), (r'acm.bbl', parse_acm), (r'alpha.bbl', parse_alpha), (r'apalike.bbl', parse_apalike), (r'ieeetr.bbl', parse_ieeetr), (r'jpc.bbl', parse_jpc), (r'pccp.bbl', parse_pccp), (r'plain.bbl', parse_plain), (r'ppcf.bbl', parse_ppcf), (r'revcompchem.bbl', parse_revcompchem)] set = CitationSet() for i, (filename, parsing_fun) in enumerate(files, 1): for k, v in limit(get_mixed_citations(indir + filename, parsing_fun), 100): set.add(str(int(k) + i * 10000), v) #set.write_nlm(r'C:\Users\matfed\Desktop\training.xml') set.write_txt(r'C:\Users\matfed\Desktop\parsertest.txt') set.write_mapfile(r'C:\Users\matfed\Desktop\parsertest_map.txt') pass
text, r'(?P<authors>.+?), \{\\em (?P<source>.+?)\}(?:, pages (?P<fpage>.+?)--(?P<lpage>.+?))? \((?P<year>.+?)\)\. (?P<title>.+)', {'authors': authors_parser_comma}) def get_mixed_citations(filename, parsing_fun): for (no, ref, val) in read_citations(filename): yield (no, format_ref(ref) + untexify(parsing_fun(decode_utf(val)))) def limit(generator, no): for i in range(no): yield generator.next() indir = r'C:\Users\matfed\Desktop\bbls\\' files = [(r'abbrv.bbl', parse_abbrv), (r'acm.bbl', parse_acm), (r'alpha.bbl', parse_alpha), (r'apalike.bbl', parse_apalike), (r'ieeetr.bbl', parse_ieeetr), (r'jpc.bbl', parse_jpc), (r'pccp.bbl', parse_pccp), (r'plain.bbl', parse_plain), (r'ppcf.bbl', parse_ppcf), (r'revcompchem.bbl', parse_revcompchem)] set = CitationSet() for i, (filename, parsing_fun) in enumerate(files, 1): for k, v in limit(get_mixed_citations(indir + filename, parsing_fun), 100): set.add(str(int(k) + i * 10000), v) #set.write_nlm(r'C:\Users\matfed\Desktop\training.xml') set.write_txt(r'C:\Users\matfed\Desktop\parsertest.txt') set.write_mapfile(r'C:\Users\matfed\Desktop\parsertest_map.txt') pass