Ejemplo n.º 1
0
def _get_parser(date, doctype='grant'):
    """
    Given a [date], return the parser class configured to parse it.

    The handler table comes from get_xml_handlers('process.cfg', doctype).
    Every key other than 'default' is treated as a (start, end) pair and
    matches when start <= date <= end (both ends inclusive). If no range
    matches, the entry stored under 'default' is returned; a table without
    a 'default' entry makes that lookup fail.
    """
    xmlhandlers = get_xml_handlers('process.cfg', doctype)
    # items() is available on both Python 2 and Python 3 mappings, whereas
    # iterkeys() only exists on Python 2.
    for daterange, handler in xmlhandlers.items():
        # 'default' is the fallback entry, not a date range: indexing it
        # would compare single characters against the date, which is
        # meaningless and raises TypeError on Python 3 for non-str dates.
        if daterange == 'default':
            continue
        if daterange[0] <= date <= daterange[1]:
            return handler
    return xmlhandlers['default']
Ejemplo n.º 2
0
def _get_parser(date, doctype='grant'):
    """
    Given a [date], return the parser class configured to parse it.

    The handler table comes from get_xml_handlers('process.cfg', doctype).
    Every key other than 'default' is treated as a (start, end) pair and
    matches when start <= date <= end (both ends inclusive). If no range
    matches, the entry stored under 'default' is returned; a table without
    a 'default' entry makes that lookup fail.
    """
    xmlhandlers = get_xml_handlers('process.cfg', doctype)
    # items() is available on both Python 2 and Python 3 mappings, whereas
    # iterkeys() only exists on Python 2.
    for daterange, handler in xmlhandlers.items():
        # 'default' is the fallback entry, not a date range: indexing it
        # would compare single characters against the date, which is
        # meaningless and raises TypeError on Python 3 for non-str dates.
        if daterange == 'default':
            continue
        if daterange[0] <= date <= daterange[1]:
            return handler
    return xmlhandlers['default']
Ejemplo n.º 3
0
import contextlib
import itertools
import logging
import mmap
import os
import re
import sys

import lib.argconfig_parse as argconfig_parse
import lib.handlers.grant_handler as grant_handler
import lib.patSQL as patSQL
from lib.config_parser import get_xml_handlers

# XML classes from lib.patSQL, gathered into a single list.
# NOTE(review): how this list is consumed is not visible in this part of
# the file -- confirm before reordering or trimming it.
xmlclasses = [
    patSQL.AssigneeXML, patSQL.CitationXML, patSQL.ClassXML,
    patSQL.InventorXML, patSQL.PatentXML, patSQL.PatdescXML,
    patSQL.LawyerXML, patSQL.ScirefXML, patSQL.UsreldocXML,
]

# Matches one embedded XML document: the "<?xml version ...>" declaration,
# the DOCTYPE that follows it, and everything up to the closing tag whose
# name equals the DOCTYPE root (backreference \2). DOTALL lets ".*?" cross
# line breaks; IGNORECASE makes the whole match case-insensitive.
regex = re.compile(
    r"""([<][?]xml version.*?[>]\s*[<][!]DOCTYPE\s+([A-Za-z-]+)\s+.*?/\2[>])""",
    re.S | re.I,
)

# Handler table read from process.cfg once, when the module is imported.
xmlhandlers = get_xml_handlers('process.cfg')

def list_files(patentroot, xmlregex):
    """
    Returns listing of all files within patentroot
    whose filenames match xmlregex

    Matching is a case-insensitive regex search on each entry name
    returned by os.listdir(patentroot). Every result is built as
    patentroot + '/' + name, in os.listdir order.

    If nothing matches, an error is logged and the process exits with
    status 1 (sys.exit raises SystemExit).
    """
    # Compile once instead of re-resolving the pattern for every entry.
    pattern = re.compile(xmlregex, re.I)
    files = [patentroot + '/' + fi for fi in os.listdir(patentroot)
             if pattern.search(fi) is not None]
    if not files:
        # Report the arguments actually received; the previous message
        # referenced undefined XMLREGEX / PATENTROOT names and raised
        # NameError instead of logging and exiting.
        logging.error("No files matching %s found in %s", xmlregex, patentroot)
        sys.exit(1)
    return files

def _get_date(filename, dateformat='ipg%y%m%d.xml'):
    """