def savedoc(suborgid, orgid, sourceurl, documentdate, name, dochash,
            pdftext, tokens, orphaned):
    '''
    Store a document record, its text and its token frequencies.

    A row is added through ``documents().add`` (stamped with today's date
    as the scrape date), the text is added through ``documenttexts().add``
    and every token longer than three characters is added through
    ``words().add`` together with its frequency.

    tokens must provide ``items()`` yielding (token, frequency) pairs.

    Returns the id produced by ``documents().add``.
    '''
    today = time.strftime('%Y-%m-%d')

    docid = documents().add(suborgid, orgid, sourceurl, documentdate,
                            today, name, dochash, orphaned)
    documenttexts().add(docid, pdftext)

    wordstore = words()
    for term, count in tokens.items():
        # Tokens of three characters or fewer are not stored.
        if len(term) <= 3:
            continue
        wordstore.add(docid, suborgid, orgid, term, count)

    return docid
'''
Created on Nov 3, 2011

@author: ashwani
'''
import sys
import getopt
import parsing.xmlsaxparser as myparser2
from ngram import ngramsfreq
from documents import documents

# Module-level instances of the classes that, per the original author's
# note, implement the processtext function.
# NOTE: the second assignment rebinds ``documents`` from the imported class
# to an instance of that class.
ngramfreq = ngramsfreq()
documents = documents()


def processInputXml(filename, objectHandler):
    '''
    Parse the XML file ``filename`` with ``_SaxParser`` from
    parsing.xmlsaxparser, handing ``objectHandler`` to the parser.
    '''
    saxparser = myparser2._SaxParser(filename, objectHandler)
    saxparser.parsexml()


class ProcessCorpus:
    '''
    Callable wrapper that forwards each record to a text-processing object.
    '''

    def __init__(self, callerobject, basestorage):
        '''
        callerobject: object whose ``processtext`` method is invoked for
            each record. According to the original author there are
            currently two classes that implement it: the documents class
            and the ngramfreq class.
        basestorage: backing store consulted by ``getrecord``; may be None.
        '''
        self.__basestorage = basestorage
        self.__callerobject = callerobject

    def __call__(self, personalrecord=None):
        '''
        Pass ``personalrecord`` to the wrapped object's ``processtext``.
        The result of ``processtext`` is not returned.
        '''
        handler = self.__callerobject
        handler.processtext(personalrecord)

    def getrecord(self, key):
        # Without a backing store there is nothing to look up.
        if self.__basestorage is None:
            return None