def run_parse(files, doctype='grant'):
    """Configure file logging, then run ``parse.parse_files`` on ``files``.

    Logging is directed to ``./xml-parsing.log`` at DEBUG level before the
    parse call is made.

    Args:
        files: Passed unchanged as the first argument to
            ``parse.parse_files``.
        doctype: Passed unchanged as the second argument to
            ``parse.parse_files``. Defaults to ``'grant'``.

    Returns:
        None. The value returned by ``parse.parse_files`` is not used.
    """
    import logging

    import parse
    # NOTE(review): ``alchemy`` is never referenced in this function. The
    # import is kept in case the module is needed for import-time side
    # effects -- confirm, then drop it if it is not.
    import lib.alchemy as alchemy  # noqa: F401

    logging.basicConfig(filename='./xml-parsing.log', level=logging.DEBUG)
    parse.parse_files(files, doctype)
def save_rust_repo(srcpath, **kwargs):
    """Parse the sources listed under ``srcpath`` and insert docs and formats.

    Two passes are made:

    1. ``sourcefiles.txt`` inside ``srcpath`` is read line by line; each
       stripped line is joined onto ``srcpath`` and the resulting paths are
       parsed together. The documents extracted from that tree are handed to
       ``insert_docs``.
    2. ``formats.rst`` inside ``srcpath`` is parsed on its own and the
       formats extracted from it are handed to ``insert_formats``.

    A count is printed after each insert.

    Args:
        srcpath: Directory containing ``sourcefiles.txt`` and ``formats.rst``.
        **kwargs: Forwarded unchanged to ``insert_docs`` and
            ``insert_formats``.
    """
    import parse

    listing_path = os.path.join(srcpath, 'sourcefiles.txt')
    srcfiles = []
    with open(listing_path, 'rU') as ifile:
        for line in ifile:
            srcfiles.append(os.path.join(srcpath, line.strip()))

    docs = extract_docs(parse.parse_files(srcfiles))
    insert_docs(docs, **kwargs)
    print('inserted %d documents' % len(docs))

    formats_path = os.path.join(srcpath, 'formats.rst')
    formats = extract_formats(parse.parse_files([formats_path]))
    insert_formats(formats, **kwargs)
    print('inserted %d formats' % len(formats))
def test_parse_patent(self):
    """Check the number of parsed patents for the single-grant XML fixture.

    Lists the fixture files matching ``ipg120327.one.xml`` under
    ``fixtures/xml``, parses them, and expects ``parse.parse_patents`` to
    produce ``len(xmlclasses)`` results per parsed grant.
    """
    testdir = os.path.join(basedir, './fixtures/xml')
    xmlregex = r'ipg120327.one.xml'
    filelist = parse.list_files(testdir, xmlregex)
    grant_list = list(parse.parse_files(filelist))
    parsed_grants = list(parse.parse_patents(grant_list))
    # assertEqual (rather than assertTrue on ``==``) reports both counts
    # when the check fails.
    self.assertEqual(len(parsed_grants), len(grant_list) * len(xmlclasses))
def __init__(self, name, output_path):
    """Collect the ``<name>_*.hxx`` headers and parse them.

    Args:
        name: Prefix used to glob headers as ``<oce_include>/<name>_*.hxx``;
            also stored as ``self.name``.
        output_path: Stored as ``self.output_path``; not otherwise used here.

    Attributes set:
        files: List of matching header paths for which ``ignore(path)`` is
            false.
        ns: Result of ``parse.parse_files(oce_include, self.files)``.
    """
    self.name = name
    self.output_path = output_path
    candidates = glob(oce_include + "/" + name + "_*.hxx")
    # Build a real list instead of using ``filter``: on Python 3 ``filter``
    # returns a one-shot iterator, which would leave ``self.files`` exhausted
    # once ``parse_files`` has iterated it.
    self.files = [header for header in candidates if not ignore(header)]
    self.ns = parse.parse_files(oce_include, self.files)
def test_use_parse_files_one(self):
    """Build a ``PatentGrant`` from the first parsed entry and wrap it.

    Parses the ``testfileone`` fixture, constructs a ``PatentGrant`` from
    element ``[0][1]`` of the parsed output, instantiates every class in
    ``xmlclasses`` with it, and checks that each instance is truthy.
    """
    filelist = [testdir + testfileone]
    parsed_output = list(parse.parse_files(filelist))
    patobj = PatentGrant(parsed_output[0][1], True)
    parsed_xml = [xmlclass(patobj) for xmlclass in xmlclasses]
    # assertEqual shows both lengths on failure, unlike assertTrue(a == b).
    self.assertEqual(len(parsed_xml), len(xmlclasses))
    self.assertTrue(all(parsed_xml))
def test_parse_files_one(self):
    """Check the shape of ``parse.parse_files`` output for ``testfileone``.

    The result must be iterable and contain exactly one item; that item must
    be a tuple whose element at index 1 is a ``str`` matched by ``regex``.
    """
    filelist = [testdir + testfileone]
    parsed_output = parse.parse_files(filelist)
    self.assertTrue(isinstance(parsed_output, Iterable))
    parsed_output = list(parsed_output)
    # assertEqual reports the actual length when the check fails.
    self.assertEqual(len(parsed_output), 1)
    self.assertTrue(isinstance(parsed_output[0], tuple))
    self.assertTrue(isinstance(parsed_output[0][1], str))
    self.assertTrue(regex.match(parsed_output[0][1]))
def run_parse():
    """Parse ``files``, build tables from the parsed grants, return inserts.

    ``files`` is neither a parameter nor a local of this function; it is
    looked up in an enclosing scope that is not visible from here.

    The pipeline is ``parse.parse_files`` -> ``parse.parse_patents`` ->
    ``parse.build_tables``.

    Returns:
        Whatever ``parse.get_inserts()`` returns after the tables are built.
    """
    import parse

    parsed_xmls = parse.parse_files(files)
    parsed_grants = parse.parse_patents(parsed_xmls)
    parse.build_tables(parsed_grants)
    return parse.get_inserts()
def test_use_parse_files_two(self):
    """Check every entry parsed from ``testfiletwo`` builds a PatentGrant.

    Each item produced by ``parse.parse_files`` must be a tuple whose element
    at index 1 is a ``str``, and ``grant_handler_v42.PatentGrant`` built from
    that string must be truthy.
    """
    filelist = [testdir + testfiletwo]
    parsed_output = parse.parse_files(filelist)
    for us_patent_grant in parsed_output:
        self.assertTrue(isinstance(us_patent_grant, tuple))
        self.assertTrue(isinstance(us_patent_grant[1], str))
        patobj = grant_handler_v42.PatentGrant(us_patent_grant[1], True)
        self.assertTrue(patobj)
def test_use_parse_files_two(self):
    """Check every entry parsed from ``testfiletwo`` builds a PatentGrant.

    Each item produced by ``parse.parse_files`` must be a tuple whose element
    at index 1 is a ``str``, and ``grant_handler_v42.PatentGrant`` built from
    that string must be truthy.
    """
    filelist = [testdir + testfiletwo]
    for us_patent_grant in parse.parse_files(filelist):
        self.assertTrue(isinstance(us_patent_grant, tuple))
        xml_text = us_patent_grant[1]
        self.assertTrue(isinstance(xml_text, str))
        patobj = grant_handler_v42.PatentGrant(xml_text, True)
        self.assertTrue(patobj)
def test_parse_files_one(self):
    """Check the shape of ``parse.parse_files`` output for ``testfileone``.

    The result must be iterable and contain exactly one item; that item must
    be a tuple whose element at index 1 is a ``str`` matched by ``regex``.
    """
    filelist = [testdir + testfileone]
    parsed_output = parse.parse_files(filelist)
    self.assertTrue(isinstance(parsed_output, Iterable))
    parsed_output = list(parsed_output)
    # assertEqual reports the actual length when the check fails.
    self.assertEqual(len(parsed_output), 1)
    first_entry = parsed_output[0]
    self.assertTrue(isinstance(first_entry, tuple))
    self.assertTrue(isinstance(first_entry[1], str))
    self.assertTrue(regex.match(first_entry[1]))
def test_use_parse_files_two(self):
    """Wrap every grant parsed from ``testfiletwo`` in each xml class.

    Each item produced by ``parse.parse_files`` must be a tuple whose element
    at index 1 is a ``str``. A ``PatentGrant`` is built from that string and
    passed to every class in ``xmlclasses``. The test expects
    ``2 * len(xmlclasses)`` instances in total, all of them truthy.
    """
    filelist = [testdir + testfiletwo]
    parsed_output = parse.parse_files(filelist)
    parsed_xml = []
    for us_patent_grant in parsed_output:
        self.assertTrue(isinstance(us_patent_grant, tuple))
        self.assertTrue(isinstance(us_patent_grant[1], str))
        patobj = PatentGrant(us_patent_grant[1], True)
        parsed_xml.extend(xmlclass(patobj) for xmlclass in xmlclasses)
    # assertEqual shows both counts on failure, unlike assertTrue(a == b).
    self.assertEqual(len(parsed_xml), 2 * len(xmlclasses))
    self.assertTrue(all(parsed_xml))
def scrape(path):
    """Gather every ``.py`` file beneath ``path`` and pass them to ``parse_files``.

    The directory tree is traversed with an explicit stack, so the most
    recently discovered directory is visited next. Symbolic links, whether
    they point at files or directories, are skipped entirely.

    Args:
        path: Root directory to start from.

    Returns:
        The result of ``parse_files`` called with the list of collected
        file paths.
    """
    collected = []
    pending = [path]
    while pending:
        current = pending.pop()
        for entry in os.listdir(current):
            candidate = os.path.join(current, entry)
            if os.path.islink(candidate):
                continue
            if os.path.isdir(candidate):
                pending.append(candidate)
                continue
            if os.path.isfile(candidate) and candidate.endswith('.py'):
                collected.append(candidate)
    return parse_files(collected)
def save_concept_lessons_csv(ctfiles, csvfile, func=flag_rst_images,
                             blocks=(':warning:', ':comment:', ':derivation:',
                                     ':intro:', ':informal-definition:',
                                     ':formal-definition:')):
    """Parse ``ctfiles`` and write the resulting lesson tree to ``csvfile``.

    Steps, in order:

    1. Parse ``ctfiles`` into a tree with ``parse.parse_files``.
    2. Open ``csvfile`` for writing (UTF-8) and create a ``csv.writer``.
    3. Emit one record per entry of ``defaultErrorModels`` through
       ``save_generic_error``.
    4. Walk the tree once to give each node a sequential ``nodeID``.
    5. Walk the tree again and, for each node that has ``tokens``, dispatch
       on its metadata and first token to the matching ``save_*`` helper,
       then write one ``conceptlink`` row per concept link.

    Args:
        ctfiles: Passed unchanged to ``parse.parse_files``.
        csvfile: Path of the CSV file to write.
        func: Passed through to the ``save_*`` helpers; defaults to
            ``flag_rst_images``.
        blocks: First-token values handled by the block branch below.
    """
    tree = parse.parse_files(ctfiles)
    with codecs.open(csvfile, 'w', encoding='utf-8') as ofile:
        writer = csv.writer(ofile)
        for error in defaultErrorModels:
            save_generic_error(error, writer)
        for i, lesson in enumerate(tree.walk()): # assign node IDs
            lesson.nodeID = i
        for lesson in tree.walk():
            # Nodes without a ``tokens`` attribute are skipped altogether.
            if not hasattr(lesson, 'tokens'):
                continue
            metadata = lesson.metadata_dict()
            if 'fallacy' in metadata.get('conceptType', ()) \
               or 'violates' in metadata:
                save_concept_error(lesson, metadata, writer, func)
                continue # do not generate concept links
            elif lesson.tokens[0] == 'section':
                save_section_csv(lesson, metadata, writer, func)
            elif lesson.tokens[0] in blocks:
                # ``tokens[0][1:-1]`` drops the first and last character of
                # the block token (the surrounding colons for the defaults).
                if len(lesson.tokens) >= 2:
                    save_section_csv(lesson, metadata, writer, func,
                                     lesson.tokens[0][1:-1], lesson.tokens[1])
                # With no second token, fall back to the parent's conceptID
                # when the parent has a truthy one; otherwise nothing is
                # saved for this block.
                elif lesson.parent and getattr(lesson.parent, 'conceptID', 0):
                    save_section_csv(lesson, metadata, writer, func,
                                     lesson.tokens[0][1:-1],
                                     lesson.parent.conceptID)
            elif lesson.tokens[0] == ':question:':
                if is_multipart_question(lesson):
                    save_section_csv(lesson, metadata, writer, func)
                else:
                    save_question_csv2(lesson, writer, func)
            for relation, conceptID in get_concept_links(metadata):
                # A 'defines' link records the concept on the node itself;
                # the block branch above reads it via ``parent.conceptID``.
                if relation == 'defines':
                    lesson.conceptID = conceptID
                # NOTE(review): nesting was reconstructed from a collapsed
                # source line; this row is assumed to be written for every
                # relation, not only 'defines' -- confirm.
                writer.writerow(('conceptlink', lesson.nodeID, relation,
                                 conceptID))
def test_use_parse_files_one(self):
    """Build a ``grant_handler_v42.PatentGrant`` from the first parsed entry.

    Parses the ``testfileone`` fixture and checks that a ``PatentGrant``
    constructed from element ``[0][1]`` of the parsed output is truthy.
    """
    entries = list(parse.parse_files([testdir + testfileone]))
    first_xml = entries[0][1]
    patobj = grant_handler_v42.PatentGrant(first_xml, True)
    self.assertTrue(patobj)
def test_use_parse_files_one(self):
    """Build a ``grant_handler_v42.PatentGrant`` from the first parsed entry.

    Parses the ``testfileone`` fixture and checks that a ``PatentGrant``
    constructed from element ``[0][1]`` of the parsed output is truthy.
    """
    fixture_paths = [testdir + testfileone]
    parsed = list(parse.parse_files(fixture_paths))
    grant = grant_handler_v42.PatentGrant(parsed[0][1], True)
    self.assertTrue(grant)