Example #1
0
def savedoc(suborgid,orgid,sourceurl,documentdate,name,dochash,pdftext,tokens,orphaned):
    """Store a scraped document together with its text and word counts.

    The document row is added first, stamped with the current date
    formatted as ``YYYY-MM-DD``; the id returned by that insert is then
    used to attach the extracted text and the per-token frequencies.

    ``tokens`` must provide ``items()`` yielding ``(token, frequency)``
    pairs. Tokens of three characters or fewer are not stored.

    Returns the id produced by ``documents().add(...)``.
    """
    today = time.strftime('%Y-%m-%d')
    docid = documents().add(
        suborgid, orgid, sourceurl, documentdate, today, name, dochash, orphaned
    )
    documenttexts().add(docid, pdftext)

    word_store = words()
    # Only tokens longer than three characters are recorded.
    long_tokens = (
        (tok, freq) for tok, freq in tokens.items() if len(tok) > 3
    )
    for tok, freq in long_tokens:
        word_store.add(docid, suborgid, orgid, tok, freq)
    return docid
Example #2
0
'''
Created on Nov 3, 2011

@author: ashwani
'''
import sys
import getopt
import parsing.xmlsaxparser as myparser2
from ngram import ngramsfreq
from documents import documents
# Module-level instances created once at import time. Per the original note,
# the following classes implement the processtext function.
ngramfreq = ngramsfreq()
# NOTE(review): this rebinds the name `documents` from the imported class to
# an instance of it, so the class itself is no longer reachable by that name
# after this line.
documents = documents()
def processInputXml(filename, objectHandler):
    """Run the SAX parser wrapper over ``filename``.

    Constructs ``myparser2._SaxParser`` with ``filename`` and
    ``objectHandler`` and invokes its ``parsexml()`` method. The result
    of ``parsexml()`` is discarded.
    """
    sax_parser = myparser2._SaxParser(filename, objectHandler)
    sax_parser.parsexml()
    
class ProcessCorpus:
    def __init__(self, callerobject, basestorage):
        """Remember the text processor and the backing storage.

        ``callerobject`` must implement ``processtext``; it is the object
        invoked when this instance is called. The original note lists two
        classes that do so: the documents class and the ngramfreq class.

        ``basestorage`` is kept for record lookups and may be ``None``.
        """
        self.__basestorage = basestorage
        self.__callerobject = callerobject
    def __call__(self, personalrecord = None):
        """Forward ``personalrecord`` to the caller object's ``processtext``.

        Whatever ``processtext`` returns is discarded; this call always
        returns ``None``.
        """
        handler = self.__callerobject.processtext
        handler(personalrecord)
        
    def getrecord(self, key):
        if self.__basestorage is None:
            return None