def main():
    """Parse every file in ``test_files`` and dump the results as tagged records.

    Each input is read in full and handed to ``OUPJATSParser.parse``; inputs
    that cannot be read or parsed are reported on stdout and skipped. The
    successfully parsed documents are then written, one by one, to ``outfile``
    in Classic tagged format. A document that fails to serialize is reported
    and does not stop the remaining ones from being written.
    """
    oup_parser = OUPJATSParser()
    parsed_docs = []
    do_refs = False  # assigned but not referenced anywhere in this function

    for path in test_files:
        try:
            with open(path, 'r') as source_fp:
                parsed_docs.append(oup_parser.parse(source_fp.read()))
        except Exception as err:
            print("Error in MNRAS (OUP) parser:", path, err)

    # Write everything out in Classic tagged format
    tagged_writer = Tagged()
    with open(outfile, 'w') as tag_fp:
        for record in parsed_docs:
            try:
                tagged_writer.write(record, tag_fp)
            except Exception as err:
                print("Error in serialization:", err)
def main():
    """Parse the files in ``inputFileList`` with ``GCNCParser`` and tag them.

    Every path is echoed to stdout before it is processed. A file that cannot
    be read, or whose contents cannot be parsed, is reported and skipped.
    ``outputTagFile`` is only opened (in write mode) when at least one file
    parsed successfully; serialization errors are printed per document.
    """
    parsed_records = []

    for path in inputFileList:
        print(path)

        try:
            with open(path, 'r') as handle:
                raw_text = handle.read()
        except Exception as err:
            print(path)
            print("couldnt read it:", err)
            continue

        try:
            parsed_records.append(GCNCParser(raw_text).parse())
        except Exception as err:
            print(path)
            print("Couldnt parse it:", err)

    # Nothing parsed: leave the output file untouched.
    if not parsed_records:
        return

    with open(outputTagFile, 'w') as tag_fp:
        writer = Tagged()
        for record in parsed_records:
            try:
                writer.write(record, tag_fp)
            except Exception as err:
                print(err)
def test_classic_tagged(self):
    """Serialize each JSON fixture and compare it with its expected ``.tag`` file.

    For every path in ``self.inputdocs`` the JSON document is loaded, written
    through ``Tagged`` into an in-memory buffer, and the result is saved next
    to the expected file as ``<name>.tag.parsed``. When the expected file
    ``<self.outputdir>/<name>.tag`` exists, the two must be equal and the
    saved copy is removed; when it does not exist, a note is written to
    stderr and the saved copy is kept for inspection.
    """
    tagged_writer = Tagged()

    for input_path in self.inputdocs:
        # json.load raises if the fixture is unreadable or is not valid JSON
        with open(input_path, 'r') as input_fp:
            record = json.load(input_fp)
        self.assertIsNotNone(record, "%s: error reading doc" % input_path)

        buf = cStringIO.StringIO()
        tagged_writer.write(record, buf)
        serialized = buf.getvalue()
        buf.close()
        self.assertNotEqual(serialized, '')

        stem = os.path.splitext(os.path.basename(input_path))[0]
        expected_path = os.path.join(self.outputdir, stem + '.tag')

        # save temporary copy
        scratch_path = expected_path + '.parsed'
        with open(scratch_path, 'w') as scratch_fp:
            scratch_fp.write(serialized)

        if not os.path.exists(expected_path):
            sys.stderr.write("could not find shouldbe file %s\n" % expected_path)
            sys.stderr.write("parsed output saved to %s\n" % scratch_path)
            continue

        with open(expected_path, 'r') as expected_fp:
            expected = expected_fp.read()
        # A mismatch raises here, which leaves the saved copy on disk.
        self.assertEqual(expected, serialized, "results differ from %s" % expected_path)
        os.remove(scratch_path)
def main():
    """Parse the ProQuest dump and write its records in tagged format.

    ``ProQuestParser`` is constructed with the hard-coded input file name and
    run once; the records are then taken from ``parser.results``, counted on
    stdout, and written one by one to ``lolproque.tag`` (opened in write
    mode, so an existing file is replaced).
    """
    input_file = 'SAO_NASA_Jul_2020.UNX'
    parser = ProQuestParser(input_file)
    # The return value is not needed: the records are read from parser.results.
    parser.parse()

    # print() with a single pre-formatted string behaves the same on
    # Python 2 and Python 3; the bare print statement only parses on Python 2.
    print("%s records processed" % len(parser.results))

    tag = Tagged()
    outfile = 'lolproque.tag'
    with open(outfile, 'w') as fo:
        for rec in parser.results:
            tag.write(rec, fo)
def main():
    """Convert the ``*.gcn3`` stub files to tagged format in ``output.tag``.

    Each file is read, passed through ``hex_entities``, parsed with
    ``GCNCParser`` and serialized with ``Tagged``. ``output.tag`` is opened in
    write mode, so an existing file is replaced. Problems are reported on
    stdout; a file whose parser cannot be built or whose parse fails is
    skipped, so that no stale record from an earlier file is written in its
    place.
    """
    import os

    basedir = 'pyingest/tests/data/stubdata/input/'
    flist = glob(basedir + '*.gcn3')
    # A hand-picked list of bare file names works here as well, e.g.
    # flist = ['25548.gcn3','23456.gcn3','23457.gcn3','23458.gcn3','25321.gcn3','9999.gcn3','98765.gcn3']

    with open('output.tag', 'w') as fo:
        for f in flist:
            # glob() returns matches with the directory part already attached,
            # so prepending basedir to them would double the directory.
            # Reducing to the base name first keeps bare file names working too.
            f2 = basedir + os.path.basename(f)
            try:
                # NOTE(review): mode 'rU' is rejected by Python 3.11+; kept
                # because the target interpreter is not visible from here.
                with open(f2, 'rU') as fg:
                    try:
                        d = fg.read()
                    except Exception as err:
                        # Fall back to empty input, as the original did.
                        d = ''
                        print(f2)
                        print("couldnt read it:", err)

                    try:
                        # d = namedentities.hex_entities(d)
                        # NOTE(review): repr() wraps the text in quotes and
                        # escapes it - confirm the parser expects that form.
                        d = repr(hex_entities(d))
                    except Exception as err:
                        d = ''
                        print(f2)
                        print("Couldnt convert to hex:", err)

                    try:
                        x = GCNCParser(d)
                    except Exception as err:
                        print("failed at GCNCParser(d) step:", err)
                        # No parser for this file: do not reuse an earlier one.
                        continue

                    try:
                        y = x.parse()
                    except Exception as err:
                        print("failed at x.parse step:", err)
                        # No record for this file: do not re-write an earlier one.
                        continue

                    try:
                        serializer = Tagged()
                        serializer.write(y, fo)
                    except Exception as err:
                        print(f2)
                        print("Couldnt serialize it:", err)
            except Exception as err:
                print("Problem parsing %s" % f2)
                print("Error: %s" % err)
def main():
    """Fetch one PoS volume and append its records to a tagged file.

    The volume comes from ``args.volume`` (as returned by ``get_args()``) and
    is appended to the ``https://pos.sissa.it/`` base URL, which is handed to
    ``PoSParser.parse``. Records are appended to ``args.output`` when given,
    otherwise to ``PoS.<volume>.tag``. A message is printed instead when no
    volume was supplied or when the parser returned nothing.
    """
    args = get_args()

    if not args.volume:
        print('You must provide a volume number, using -v ###')
        return

    url = 'https://pos.sissa.it/' + args.volume
    parser = PoSParser()
    documents = parser.parse(url)

    if not documents:
        print('No data extracted from pos.sissa.it.')
        return

    if args.output:
        outfile = args.output
    else:
        outfile = 'PoS.' + args.volume + '.tag'

    # The context manager closes the file even if a record fails to
    # serialize; the exception itself still propagates to the caller.
    with open(outfile, 'a') as outputfp:
        for d in documents:
            serializer = Tagged()
            serializer.write(d, outputfp)
# For every ISSN in journal_ISSN, pick one volume directory under basedir,
# parse all XML papers found two levels below it, and dump each parsed
# document (its keys, then the document as JSON) to stdout.
# NOTE(review): indentation was lost in this snippet; the per-ISSN nesting
# below is the reading under which every ISSN's papers are processed.
basedir = '/proj/ads/articles/sources/STACKS/'
for issn in journal_ISSN.keys():
    b2 = basedir + issn
    vols = glob(b2 + '/*')
    if not vols:
        # An ISSN without any volume directory must not abort the whole run
        # with an IndexError on vols[-1].
        print("No volumes found in:", b2)
        continue
    # NOTE(review): glob() does not promise any ordering, so "last" is not
    # necessarily the newest volume - confirm whether sorting is wanted.
    v = vols[-1]
    papers = glob(v + '/*/*/*.xml')

    # Try the parser
    documents = []
    for p in papers:
        try:
            # NOTE(review): mode 'rU' is rejected by Python 3.11+; kept
            # because the target interpreter is not visible from here.
            with open(p, 'rU') as fp:
                doc = parser.parse(fp)
                documents.append(doc)
        except Exception as e:
            print("Error in IOP parser:", p, e)

    # Write everything out in Classic tagged format
    # (the actual write is currently commented out; the file is still opened
    # in append mode as before, and is now closed even if the dump raises).
    with open(outfile, 'a') as fo:
        serializer = Tagged()
        for d in documents:
            print("KEYS:", d.keys())
            print(json.dumps(d, indent=4, sort_keys=True))
            # print("Hi, here's a document structure:\n%s\n\n\n"%d)
            # serializer.write(d, fo)
from pyingest.parsers.aip import AIPJATSParser
from pyingest.serializers.classic import Tagged
from pyingest.serializers.refwriter import ReferenceWriter
import pyingest.config.config as config

# One-off check of the AIP JATS parser: parse a single markup file and append
# the resulting record to test.tag in tagged format.

# Other inputs that have been tried; uncomment one to use it instead.
# infile = '/proj/ads/abstracts/data/AIP/AIP.test/RSI/v91/i5/054901_1/Markup/VOR_10.1063_5.0005676.xml'
# infile = '/proj/ads/abstracts/data/AIP/JATS.0127/JCP/v154/i2/024904_1/Markup/VOR_10.1063_5.0033645.xml'
# infile = '/proj/ads/abstracts/data/AIP/JATS.0127/AJP/v89/i2/210_1/Markup/VOR_10.1119_10.0002365.xml'
infile = '/proj/ads/abstracts/data/AIP/JATS.0609.new/RSI/v92/i6/064704_1/Markup/VOR_10.1063_5.0044438.xml'

# The parser is given the open file object rather than its contents.
with open(infile, 'r') as jats_fp:
    aip_parser = AIPJATSParser()
    doc = aip_parser.parse(jats_fp)
# print("Hi, ... %s" % doc)

# Append mode: repeated runs accumulate records in test.tag.
tagged_writer = Tagged()
with open('test.tag', 'a') as tag_fp:
    tagged_writer.write(doc, tag_fp)

# Reference output is currently switched off:
# foo = ReferenceWriter()
# foo.topdir = './'
# try:
#     foo.writeref(doc,'aip')
# except Exception as err:
#     print("Error with writeref:", err)
# check archive: do we already have it? If not, copy & parse it. if not os.path.exists(archive_file): abs_source = urllib.urlopen(absURL).read() open(archive_file, 'w').write(abs_source) pnas = PNASParser() output = pnas.parse(abs_source) records.append(output) except Exception, err: print("Error parsing %s: %s:" % (absURL, err)) if records: try: fo = open(outfile, 'a') for rec in records: try: serializer = Tagged() serializer.write(rec, fo) except Exception, err: print("Error in serializer: %s" % err) try: ref_handler = ReferenceWriter() ref_handler.writeref(rec, 'pnas') except Exception, err: print("Error in writeref: %s" % err) print("New PNAS records available in %s" % outfile) fo.close() except: print("Error writing PNAS records: %s" % err) else: print("No new PNAS records available.")
# Walk every feed in PNAS_RSS_URLS, parse each entry's link with PNASParser,
# append the record to pnas.tag and hand it to ReferenceWriter. Failures are
# reported per entry and do not stop the remaining entries.
outfile = 'pnas.tag'
# The context manager closes the file even if fetching or reading a feed
# raises part-way through.
with open(outfile, 'a') as fo:
    for k, v in PNAS_RSS_URLS.items():
        feed = feedparser.parse(v)
        # print "feed:",k
        for _item in feed['entries']:
            try:
                absURL = _item['link']
                # volno and ident are not used further down, but the lookups
                # are kept: an entry lacking either field raises here and is
                # therefore skipped, exactly as before.
                volno = _item['prism_volume'].zfill(4)
                ident = _item['dc_identifier']
                ident = ident.replace('hwp:master-id:pnas;', '')
                # print absURL,volno,ident
                pnas = PNASParser()
                output = pnas.parse(absURL)
            except Exception as err:
                print("Error in parser:", err)
            else:
                # Serialization and reference writing are attempted
                # independently, so one failing does not prevent the other.
                try:
                    serializer = Tagged()
                    serializer.write(output, fo)
                except Exception as err:
                    print("Error in serializer:", err)
                try:
                    ref_handler = ReferenceWriter()
                    ref_handler.writeref(output, 'pnas')
                except Exception as err:
                    print("Error in writeref:", err)
from pyingest.parsers.adsfeedback import ADSFeedbackParser
from pyingest.serializers.classic import Tagged

# One-off check of the ADS feedback parser: read the JSON test file as text,
# parse it, and emit the record in tagged format.
infile = './feedback_test.json'

# The parser receives the raw file contents, not a decoded JSON object.
with open(infile, 'r') as feedback_fp:
    data = feedback_fp.read()

output = ADSFeedbackParser(data).parse()

# No file object is passed to write(), so the record goes to whatever
# destination Tagged.write defaults to (presumably stdout - confirm).
serializer = Tagged()
serializer.write(output)