def main():
    """Parse every file in ``test_files`` with ``OUPJATSParser`` and write the
    results to ``outfile`` in Classic tagged format.

    ``test_files`` and ``outfile`` are not defined here; they are expected to
    exist at module level.  A failure while reading/parsing one file, or while
    serializing one record, is printed and does not stop the run.
    """
    parser = OUPJATSParser()

    documents = []
    for f in test_files:
        try:
            with open(f, 'r') as fp:
                input_data = fp.read()
            # Parse after the handle is closed: only the text is needed.
            documents.append(parser.parse(input_data))
        except Exception as err:
            print("Error in MNRAS (OUP) parser:", f, err)

    # Write everything out in Classic tagged format
    serializer = Tagged()
    with open(outfile, 'w') as ftag:
        for d in documents:
            try:
                serializer.write(d, ftag)
            except Exception as err:
                print("Error in serialization:", err)
def main():
    """Read each file in ``inputFileList``, parse it with ``GCNCParser`` and
    write all parsed records to ``outputTagFile`` in Classic tagged format.

    ``inputFileList`` and ``outputTagFile`` come from module scope.  Files
    that cannot be read or parsed are reported on stdout and skipped; the
    output file is only opened when at least one record was parsed.
    """
    parsed = []
    for path in inputFileList:
        print(path)

        try:
            with open(path, 'r') as handle:
                raw = handle.read()
        except Exception as err:
            print(path)
            print("couldnt read it:", err)
            continue

        try:
            parsed.append(GCNCParser(raw).parse())
        except Exception as err:
            print(path)
            print("Couldnt parse it:", err)

    if not parsed:
        return

    with open(outputTagFile, 'w') as fo:
        serializer = Tagged()
        for record in parsed:
            try:
                serializer.write(record, fo)
            except Exception as err:
                print(err)
Exemple #3
0
    def test_classic_tagged(self):
        """Serialize every JSON document in ``self.inputdocs`` with ``Tagged``
        and compare the result with the stored ``<basename>.tag`` file in
        ``self.outputdir``.

        The serialized text is first saved as ``<target>.parsed``.  That copy
        is removed once the comparison succeeds; it is left on disk (and its
        location reported on stderr) when the reference file does not exist,
        and also remains when the comparison assertion fails.
        """
        serializer = Tagged()
        for path in self.inputdocs:
            # json.load raises if the input is not valid JSON
            with open(path, 'r') as fp:
                document = json.load(fp)
                self.assertIsNotNone(document, "%s: error reading doc" % path)

            # serialize into an in-memory buffer
            buf = cStringIO.StringIO()
            serializer.write(document, buf)
            output = buf.getvalue()
            buf.close()
            self.assertNotEqual(output, '')

            stem = os.path.splitext(os.path.basename(path))[0]
            target = os.path.join(self.outputdir, stem + '.tag')

            # save temporary copy
            target_saved = target + '.parsed'
            with open(target_saved, 'w') as fp:
                fp.write(output)

            if not os.path.exists(target):
                # no reference to compare against: keep the copy and say so
                sys.stderr.write("could not find shouldbe file %s\n" % target)
                sys.stderr.write("parsed output saved to %s\n" % target_saved)
                continue

            with open(target, 'r') as fp:
                shouldbe = fp.read()
                self.assertEqual(shouldbe, output, "results differ from %s" % target)

            # comparison passed, the temporary copy is no longer needed
            os.remove(target_saved)
Exemple #4
0
def main():
    """Parse the ProQuest file ``SAO_NASA_Jul_2020.UNX`` and write every
    record in ``parser.results`` to ``lolproque.tag`` in Classic tagged format.

    The number of records is printed before writing.  Exceptions from the
    parser, the serializer or file I/O are not caught here.
    """
    input_file = 'SAO_NASA_Jul_2020.UNX'
    parser = ProQuestParser(input_file)
    # The return value is not used; the records are read from parser.results.
    parser.parse()
    # print() with a single parenthesized argument behaves the same on
    # Python 2 and Python 3, unlike the former print statement.
    print("%s records processed" % len(parser.results))
    tag = Tagged()
    outfile = 'lolproque.tag'
    with open(outfile, 'w') as fo:
        for rec in parser.results:
            tag.write(rec, fo)
def main():
    """Convert every ``*.gcn3`` file in the stub-data input directory to
    Classic tagged format, writing the records to ``output.tag``.

    Each file is read, passed through ``hex_entities`` and ``repr``, parsed
    with ``GCNCParser`` and serialized with ``Tagged``.  Every failure is
    printed; a file whose parser cannot be built or whose parse fails is
    skipped so that nothing stale is serialized in its place.
    """
    basedir = 'pyingest/tests/data/stubdata/input/'
    # glob() returns paths that already start with basedir, so they are used
    # as-is; prefixing basedir a second time would yield nonexistent paths.
    flist = glob(basedir + '*.gcn3')

    with open('output.tag', 'w') as fo:
        for f2 in flist:
            try:
                # Plain 'r': the 'U' flag was removed in Python 3.11, and
                # Python 3 text mode already uses universal newlines.
                with open(f2, 'r') as fg:
                    try:
                        d = fg.read()
                    except Exception as err:
                        d = ''
                        print(f2)
                        print("couldnt read it:", err)
                try:
                    d = repr(hex_entities(d))
                except Exception as err:
                    d = ''
                    print(f2)
                    print("Couldnt convert to hex:", err)
                try:
                    x = GCNCParser(d)
                except Exception as err:
                    print("failed at GCNCParser(d) step:", err)
                    # no parser for this file: do not reuse a previous one
                    continue
                try:
                    y = x.parse()
                except Exception as err:
                    print("failed at x.parse step:", err)
                    # no record for this file: do not rewrite a previous one
                    continue
                try:
                    serializer = Tagged()
                    serializer.write(y, fo)
                except Exception as err:
                    print(f2)
                    print("Couldnt serialize it:", err)
            except Exception as err:
                # reached when the file cannot be opened at all
                print("Problem parsing %s" % f2)
                print("Error: %s" % err)
def main():
    """Parse one PoS volume and append its records to a Classic tagged file.

    The volume comes from ``get_args().volume``; its URL under
    ``https://pos.sissa.it/`` is handed to ``PoSParser.parse``.  Records are
    appended to ``args.output`` when given, otherwise to
    ``PoS.<volume>.tag``.  A message is printed when no volume was supplied
    or when the parser returned nothing.
    """
    args = get_args()
    if not args.volume:
        print('You must provide a volume number, using -v ###')
        return

    url = 'https://pos.sissa.it/' + args.volume
    parser = PoSParser()
    documents = parser.parse(url)
    if not documents:
        print('No data extracted from pos.sissa.it.')
        return

    if args.output:
        outfile = args.output
    else:
        outfile = 'PoS.' + args.volume + '.tag'

    # One serializer is enough for all records; the context manager closes
    # the output file even if a write raises.
    serializer = Tagged()
    with open(outfile, 'a') as outputfp:
        for d in documents:
            serializer.write(d, outputfp)
# Root of the on-disk article tree; one sub-directory per ISSN.
basedir = '/proj/ads/articles/sources/STACKS/'

# For each ISSN, parse the XML papers of one volume and dump the parsed
# records to stdout.  ``journal_ISSN``, ``parser`` and ``outfile`` are
# expected to be defined earlier in the file.
for issn in journal_ISSN.keys():
    b2 = basedir + issn
    vols = glob(b2 + '/*')
    if not vols:
        # Nothing on disk for this ISSN; vols[-1] would raise IndexError.
        print("No volumes found under:", b2)
        continue
    # NOTE(review): glob() does not guarantee any ordering, so the last entry
    # is not necessarily the newest volume -- confirm whether sorting is wanted.
    v = vols[-1]
    papers = glob(v + '/*/*/*.xml')

    # Try the parser
    documents = []
    for p in papers:
        try:
            # Plain 'r': the 'U' flag was removed in Python 3.11.
            with open(p, 'r') as fp:
                doc = parser.parse(fp)
            documents.append(doc)
        except Exception as e:
            print("Error in IOP parser:", p, e)

    # Write everything out in Classic tagged format.  The actual write call
    # is currently disabled; the records are only printed for inspection.
    # The output file is still opened in append mode, as before, and is now
    # closed reliably by the context manager.
    serializer = Tagged()
    with open(outfile, 'a') as fo:
        for d in documents:
            print("KEYS:", d.keys())
            print(json.dumps(d, indent=4, sort_keys=True))
            # serializer.write(d, fo)
from pyingest.parsers.aip import AIPJATSParser
from pyingest.serializers.classic import Tagged
from pyingest.serializers.refwriter import ReferenceWriter
import pyingest.config.config as config

# Candidate AIP JATS input files; only the uncommented assignment is used.
# infile = '/proj/ads/abstracts/data/AIP/AIP.test/RSI/v91/i5/054901_1/Markup/VOR_10.1063_5.0005676.xml'
# infile = '/proj/ads/abstracts/data/AIP/JATS.0127/JCP/v154/i2/024904_1/Markup/VOR_10.1063_5.0033645.xml'
# infile = '/proj/ads/abstracts/data/AIP/JATS.0127/AJP/v89/i2/210_1/Markup/VOR_10.1119_10.0002365.xml'
infile = '/proj/ads/abstracts/data/AIP/JATS.0609.new/RSI/v92/i6/064704_1/Markup/VOR_10.1063_5.0044438.xml'

# Parse the selected file and append the resulting record to test.tag in
# Classic tagged format.
with open(infile, 'r') as jats_fp:
    aip_parser = AIPJATSParser()
    doc = aip_parser.parse(jats_fp)
    tagged_writer = Tagged()
    with open('test.tag', 'a') as tag_fp:
        tagged_writer.write(doc, tag_fp)

# Reference output is currently disabled:
# foo = ReferenceWriter()
# foo.topdir = './'
# try:
#     foo.writeref(doc, 'aip')
# except Exception as err:
#     print("Error with writeref:", err)
Exemple #9
0
            # check archive: do we already have it?  If not, copy & parse it.
            if not os.path.exists(archive_file):
                abs_source = urllib.urlopen(absURL).read()
                open(archive_file, 'w').write(abs_source)
                pnas = PNASParser()
                output = pnas.parse(abs_source)
                records.append(output)
        except Exception, err:
            print("Error parsing %s: %s:" % (absURL, err))

if records:
    # Append each new record to ``outfile`` in Classic tagged format and hand
    # it to the reference writer.  A failure in either step for one record is
    # printed and does not stop the remaining records.
    try:
        # The context manager closes the file even if something below raises.
        with open(outfile, 'a') as fo:
            for rec in records:
                try:
                    serializer = Tagged()
                    serializer.write(rec, fo)
                except Exception as err:
                    print("Error in serializer: %s" % err)
                try:
                    ref_handler = ReferenceWriter()
                    ref_handler.writeref(rec, 'pnas')
                except Exception as err:
                    print("Error in writeref: %s" % err)
            print("New PNAS records available in %s" % outfile)
    except Exception as err:
        # Bind the exception here: the message below needs it, and a bare
        # ``except:`` left ``err`` undefined when no inner handler had run.
        print("Error writing PNAS records: %s" % err)
else:
    print("No new PNAS records available.")
Exemple #10
0
outfile = 'pnas.tag'

# Walk every feed URL in PNAS_RSS_URLS, parse each entry's link with
# PNASParser and append the result to ``outfile`` in Classic tagged format,
# then pass it to the reference writer.  Per-entry failures are printed and
# the loop continues.  The context manager closes the output file even when
# something outside the per-entry handlers (e.g. feedparser.parse) raises.
with open(outfile, 'a') as fo:
    for v in PNAS_RSS_URLS.values():
        feed = feedparser.parse(v)
        for _item in feed['entries']:
            try:
                absURL = _item['link']
                # volno and ident are not used further down, but the lookups
                # are kept: an entry lacking these fields raises here and is
                # skipped, exactly as before.
                volno = _item['prism_volume'].zfill(4)
                ident = _item['dc_identifier']
                ident = ident.replace('hwp:master-id:pnas;', '')
                pnas = PNASParser()
                output = pnas.parse(absURL)
            except Exception as err:
                print("Error in parser:", err)
            else:
                try:
                    serializer = Tagged()
                    serializer.write(output, fo)
                except Exception as err:
                    print("Error in serializer:", err)
                try:
                    ref_handler = ReferenceWriter()
                    ref_handler.writeref(output, 'pnas')
                except Exception as err:
                    print("Error in writeref:", err)
from pyingest.parsers.adsfeedback import ADSFeedbackParser
from pyingest.serializers.classic import Tagged


# Feedback file to run through the parser.
infile = './feedback_test.json'

# Read the whole file, parse it with ADSFeedbackParser and serialize the
# result with Tagged.
with open(infile, 'r') as feedback_fp:
    data = feedback_fp.read()
    feedback = ADSFeedbackParser(data)
    output = feedback.parse()
    tagged = Tagged()
    # NOTE(review): write() is called without a file object here, unlike the
    # two-argument calls elsewhere -- presumably it falls back to a default
    # stream; confirm against Tagged.write.
    tagged.write(output)