def test_parse_patent(self):
     """
     Parse the single-grant fixture file and check that the number of
     parsed results equals the number of grants times the number of
     entries in xmlclasses.
     """
     testdir = os.path.join(basedir, './fixtures/xml')
     xmlregex = r'ipg120327.one.xml'
     filelist = parse.list_files(testdir, xmlregex)
     grant_list = list(parse.parse_files(filelist))
     parsed_grants = list(parse.parse_patents(grant_list))
     # assertEqual reports both counts on failure; assertTrue(a == b) only
     # says "False is not true".
     self.assertEqual(len(parsed_grants), len(grant_list) * len(xmlclasses))
 def test_list_files(self):
     """
     list_files should return a list containing exactly one entry for the
     single-grant fixture regex; that entry must be a str naming a path
     that exists on disk.
     """
     testdir = os.path.join(basedir, './fixtures/xml')
     xmlregex = r'ipg120327.one.xml'
     files = parse.list_files(testdir, xmlregex)
     self.assertTrue(isinstance(files, list))
     self.assertEqual(len(files), 1)
     # Check each element directly: wrapping the isinstance test in
     # filter() discards the non-str entries before all() sees them, so
     # the assertion could never fail on a wrongly typed entry.
     self.assertTrue(all(isinstance(x, str) for x in files))
     self.assertTrue(all(os.path.exists(x) for x in files))
 def test_list_files(self):
     """
     list_files should return a list containing exactly one entry for the
     single-grant fixture regex; that entry must be a str naming a path
     that exists on disk.
     """
     testdir = os.path.join(basedir, './fixtures/xml')
     xmlregex = r'ipg120327.one.xml'
     files = parse.list_files(testdir, xmlregex)
     self.assertTrue(isinstance(files, list))
     self.assertEqual(len(files), 1)
     # Check each element directly: wrapping the isinstance test in
     # filter() discards the non-str entries before all() sees them, so
     # the assertion could never fail on a wrongly typed entry.
     self.assertTrue(all(isinstance(x, str) for x in files))
     self.assertTrue(all(os.path.exists(x) for x in files))
Esempio n. 4
0
 def test_parse_patent(self):
     """
     Run the single-grant fixture through list_files, parallel_parse and
     parse_patent, then check that the number of parsed results equals
     the number of grants times the number of entries in xmlclasses.
     """
     patentroot = '.'
     testdir = [os.path.join(basedir, './fixtures/xml')]
     xmlregex = r'ipg120327.one.xml'
     filelist = parse.list_files(testdir, patentroot, xmlregex)
     grant_list = parse.parallel_parse(filelist)
     parsed_grants = parse.parse_patent(grant_list)
     # assertEqual reports both counts on failure; assertTrue(a == b) only
     # says "False is not true".
     self.assertEqual(len(list(parsed_grants)),
                      len(grant_list) * len(xmlclasses))
Esempio n. 5
0
        urls += generate_download_list(parse_config['years'], 'grant')
    if should_process_applications:
        urls += generate_download_list(parse_config['years'], 'application')
    downloaddir = parse_config['downloaddir']
    if downloaddir and not os.path.exists(downloaddir):
        os.makedirs(downloaddir)
    print 'Downloading files at {0}'.format(str(datetime.datetime.today()))
    download_files(urls)
    print 'Downloaded files:',parse_config['years']
    f = datetime.datetime.now()
    print 'Finished downloading in {0}'.format(str(f-s))

    # find files
    print "Starting parse on {0} on directory {1}".format(str(datetime.datetime.today()),parse_config['datadir'])
    if should_process_grants:
        files = parse.list_files(parse_config['datadir'],parse_config['grantregex'])
        print 'Running grant parse...'
        run_parse(files, 'grant')
        f = datetime.datetime.now()
        print "Found {2} files matching {0} in directory {1}"\
                .format(parse_config['grantregex'], parse_config['datadir'], len(files))
    if should_process_applications:
        files = parse.list_files(parse_config['datadir'],parse_config['applicationregex'])
        print 'Running application parse...'
        run_parse(files, 'application')
        f = datetime.datetime.now()
        print "Found {2} files matching {0} in directory {1}"\
                .format(parse_config['applicationregex'], parse_config['datadir'], len(files))
    print 'Finished parsing in {0}'.format(str(f-s))

    # run extra phases if needed, then move output files
Esempio n. 6
0
import unittest

sys.path.append('..')
from couch_patent import *

sys.path.append('../lib/')
from patXML import *

import parse

# Module-level fixtures: built once at import time and shared by the test
# case defined below.
basedir = os.path.join(os.curdir, '../test')
testdir = os.path.join(basedir, 'fixtures/xml/')
# Read the fixture inside a context manager so the file handle is closed
# as soon as its contents are loaded instead of being left open.
with open(testdir + 'ipg120327.one.xml') as _fixture:
    testfile = XMLPatentBase(_fixture.read())
patentroot = '.'
xmlregex = r'ipg120327.one.xml'
# Same fixture, run through the parse module's own pipeline.
filelist = parse.list_files([testdir], patentroot, xmlregex)
grant_list = parse.parallel_parse(filelist)
parsed_grants = list(parse.parse_patent(grant_list))


class TestCouchPatent(unittest.TestCase):
    """Tests for get_metadata, run against the module-level parsed grants."""

    def setUp(self):
        # Each test relies on the fixture document, so require it to be
        # truthy before the test body runs.
        fixture_document = testfile
        self.assertTrue(fixture_document)

    def test_get_doc_metadata(self):
        """
        get_metadata must return a dict when handed the first parsed grant.
        """
        first_grant = parsed_grants[0]
        result = get_metadata(first_grant)
        result_is_dict = isinstance(result, dict)
        self.assertTrue(result_is_dict)
Esempio n. 7
0
    if should_process_applications:
        urls += generate_download_list(parse_config['years'], 'application')
    downloaddir = parse_config['downloaddir']
    if downloaddir and not os.path.exists(downloaddir):
        os.makedirs(downloaddir)
    print 'Downloading files at {0}'.format(str(datetime.datetime.today()))
    download_files(urls)
    print 'Downloaded files:', parse_config['years']
    f = datetime.datetime.now()
    print 'Finished downloading in {0}'.format(str(f - s))

    # find files
    print "Starting parse on {0} on directory {1}".format(
        str(datetime.datetime.today()), parse_config['datadir'])
    if should_process_grants:
        files = parse.list_files(parse_config['datadir'],
                                 parse_config['grantregex'])
        print 'Running grant parse...'
        run_parse(files, 'grant')
        f = datetime.datetime.now()
        print "Found {2} files matching {0} in directory {1}"\
                .format(parse_config['grantregex'], parse_config['datadir'], len(files))
    if should_process_applications:
        files = parse.list_files(parse_config['datadir'],
                                 parse_config['applicationregex'])
        print 'Running application parse...'
        run_parse(files, 'application')
        f = datetime.datetime.now()
        print "Found {2} files matching {0} in directory {1}"\
                .format(parse_config['applicationregex'], parse_config['datadir'], len(files))
    print 'Finished parsing in {0}'.format(str(f - s))
Esempio n. 8
0
    # download the files to be parsed
    urls = generate_download_list(parse_config['years'])
    dview.scatter('urls', urls)
    # check download directory
    downloaddir = parse_config['downloaddir']
    if downloaddir and not os.path.exists(downloaddir):
        os.makedirs(downloaddir)
    dview['downloaddir'] = parse_config['downloaddir']
    dview.apply(download_files)
    print 'Downloaded files:',parse_config['years']
    f = datetime.datetime.now()
    print 'Finished downloading in {0}'.format(str(f-s))

    # find files
    print "Starting parse on {0} on directory {1}".format(str(datetime.datetime.today()),parse_config['datadir'])
    files = parse.list_files(parse_config['datadir'],parse_config['dataregex'])
    dview.scatter('files',files)
    print "Found {2} files matching {0} in directory {1}".format(parse_config['dataregex'], parse_config['datadir'], len(files))

    # run parse and commit SQL
    print 'Running parse...'
    inserts = list(itertools.chain.from_iterable(dview.apply(run_parse)))
    parse.commit_tables(inserts)
    f = datetime.datetime.now()
    print 'Finished parsing in {0}'.format(str(f-s))

    # run extra phases if needed, then move output files
    run_clean(process_config)
    run_consolidate(process_config)
    parse.move_tables(process_config['outputdir'])
import unittest

sys.path.append('..')
from couch_patent import *

sys.path.append('../lib/')
from patXML import *

import parse

# Module-level fixtures: built once at import time and shared by the test
# case defined below.
basedir = os.path.join(os.curdir, '../test')
testdir = os.path.join(basedir, 'fixtures/xml/')
# Read the fixture inside a context manager so the file handle is closed
# as soon as its contents are loaded instead of being left open.
with open(testdir+'ipg120327.one.xml') as _fixture:
    testfile = XMLPatentBase(_fixture.read())
patentroot = '.'
xmlregex = r'ipg120327.one.xml'
# Same fixture, run through the parse module's own pipeline.
filelist = parse.list_files([testdir], patentroot, xmlregex)
grant_list = parse.parallel_parse(filelist)
parsed_grants = list(parse.parse_patent(grant_list))

class TestCouchPatent(unittest.TestCase):
    """Tests for get_metadata, run against the module-level parsed grants."""

    def setUp(self):
        # Each test relies on the fixture document, so require it to be
        # truthy before the test body runs.
        fixture_document = testfile
        self.assertTrue(fixture_document)

    def test_get_doc_metadata(self):
        """
        get_metadata must return a dict when handed the first parsed grant.
        """
        first_grant = parsed_grants[0]
        result = get_metadata(first_grant)
        result_is_dict = isinstance(result, dict)
        self.assertTrue(result_is_dict)