def __init__(self, queryString='', db_name='Psychology_abstracts'):
        """Load cached corpus statistics and every document of a database.

        Args:
            queryString: Query text, stored unchanged on ``self.queryString``.
            db_name: Key used to look the database up in ``ld.databases``.

        Side effects:
            Reads ``tf.pkl``, ``df.pkl``, ``np_to_stem.pkl``,
            ``noun_phrases.pkl`` and ``noun_stems.pkl`` (relative paths, so
            they resolve against the current working directory), opens the
            Xapian index referenced by ``database.xapian`` and fetches every
            document in it into ``self.allDocs``.

        Raises:
            Whatever the second ``ld.databases[db_name]`` lookup raises if the
            database is still unavailable after ``ld.getDatabases()``.
        """
        def _load_pickle(path):
            # Read one pickle and close the handle immediately instead of
            # leaving it to the garbage collector.
            # NOTE: unpickling can execute arbitrary code; these files must
            # come from a trusted source.
            with open(path, 'rb') as handle:
                return pickle.load(handle)

        try:
            database = ld.databases[db_name]
        except Exception:
            # The first lookup failed (presumably the registry has not been
            # populated yet): load it and retry once.  ``Exception`` rather
            # than a bare ``except`` so that KeyboardInterrupt and SystemExit
            # are no longer swallowed here.
            ld.getDatabases()
            database = ld.databases[db_name]

        # TODO: this should go under loadtext in the future as it is db
        # specific.
        self.tf = _load_pickle('tf.pkl')
        self.df = _load_pickle('df.pkl')
        self.np_to_stem = _load_pickle('np_to_stem.pkl')

        # Invert np_to_stem: every stem maps to the list of keys sharing it.
        self.stem_to_np = defaultdict(list)
        for phrase, stem in self.np_to_stem.iteritems():
            self.stem_to_np[stem].append(phrase)

        # Per-key occurrence counts, with the keys normalised to ``str``.
        self.noun_phrase_counts = dict(
            (str(key), Counter(values))
            for key, values in _load_pickle('noun_phrases.pkl').iteritems())
        self.noun_stem_counts = dict(
            (str(key), Counter(values))
            for key, values in _load_pickle('noun_stems.pkl').iteritems())

        self.database = database
        self.queryString = queryString

        db_doc = xapian.Database(self.database.xapian)
        self.N = db_doc.get_doccount()

        # Match-all query sized to the document count, so a single result
        # set holds the whole index.
        enquire = xapian.Enquire(db_doc)
        enquire.set_query(xapian.Query.MatchAll)
        matches = enquire.get_mset(0, self.N)

        # The data stored on each Xapian document is the handle passed to
        # the database's get_doc().
        self.allDocs = [
            self.database.get_doc(match.document.get_data())
            for match in matches
        ]

        # ``stopwords`` is a module-level name that is only read here, so no
        # ``global`` declaration is needed.
        stopper = SimpleStopper()
        for sword in stopwords:
            stopper.add(sword)
        self.stopper = stopper
    def __init__(self, queryString='', db_name='Psychology_abstracts'):
        """Load cached corpus statistics and every document of a database.

        Args:
            queryString: Query text, stored unchanged on ``self.queryString``.
            db_name: Key used to look the database up in ``ld.databases``.

        Side effects:
            Reads ``tf.pkl``, ``df.pkl``, ``np_to_stem.pkl``,
            ``noun_phrases.pkl`` and ``noun_stems.pkl`` (relative paths, so
            they resolve against the current working directory), opens the
            Xapian index referenced by ``database.xapian`` and fetches every
            document in it into ``self.allDocs``.

        Raises:
            Whatever the second ``ld.databases[db_name]`` lookup raises if the
            database is still unavailable after ``ld.getDatabases()``.
        """
        def _load_pickle(path):
            # Read one pickle and close the handle immediately instead of
            # leaving it to the garbage collector.
            # NOTE: unpickling can execute arbitrary code; these files must
            # come from a trusted source.
            with open(path, 'rb') as handle:
                return pickle.load(handle)

        try:
            database = ld.databases[db_name]
        except Exception:
            # The first lookup failed (presumably the registry has not been
            # populated yet): load it and retry once.  ``Exception`` rather
            # than a bare ``except`` so that KeyboardInterrupt and SystemExit
            # are no longer swallowed here.
            ld.getDatabases()
            database = ld.databases[db_name]

        # TODO: this should go under loadtext in the future as it is db
        # specific.
        self.tf = _load_pickle('tf.pkl')
        self.df = _load_pickle('df.pkl')
        self.np_to_stem = _load_pickle('np_to_stem.pkl')

        # Invert np_to_stem: every stem maps to the list of keys sharing it.
        self.stem_to_np = defaultdict(list)
        for phrase, stem in self.np_to_stem.iteritems():
            self.stem_to_np[stem].append(phrase)

        # Per-key occurrence counts, with the keys normalised to ``str``.
        self.noun_phrase_counts = dict(
            (str(key), Counter(values))
            for key, values in _load_pickle('noun_phrases.pkl').iteritems())
        self.noun_stem_counts = dict(
            (str(key), Counter(values))
            for key, values in _load_pickle('noun_stems.pkl').iteritems())

        self.database = database
        self.queryString = queryString

        db_doc = xapian.Database(self.database.xapian)
        self.N = db_doc.get_doccount()

        # Match-all query sized to the document count, so a single result
        # set holds the whole index.
        enquire = xapian.Enquire(db_doc)
        enquire.set_query(xapian.Query.MatchAll)
        matches = enquire.get_mset(0, self.N)

        # The data stored on each Xapian document is the handle passed to
        # the database's get_doc().
        self.allDocs = [
            self.database.get_doc(match.document.get_data())
            for match in matches
        ]

        # ``stopwords`` is a module-level name that is only read here, so no
        # ``global`` declaration is needed.
        stopper = SimpleStopper()
        for sword in stopwords:
            stopper.add(sword)
        self.stopper = stopper
Beispiel #3
0
def build_stopwords(language, encoding="utf8"):
    """Return the stop words listed in ``language.stopwords`` as unicode.

    The text is read line by line.  On each line everything from the first
    ``|`` onwards is discarded, the remainder is stripped of surrounding
    whitespace and decoded with ``encoding``; lines that end up empty are
    skipped.

    Relies on ``StringIO`` and the Python 2 ``unicode`` builtin being in
    scope.

    Args:
        language: Object exposing the raw stop-word text as ``.stopwords``.
        encoding: Codec used to decode each entry.

    Returns:
        List of unicode stop words, in the order they appear in the text.
    """
    words = []
    for raw_line in StringIO(language.stopwords):
        # Keep only what precedes the first "|" on the line.
        entry = unicode(raw_line.partition("|")[0].strip(), encoding)
        if entry:
            words.append(entry)
    return words

# Stop-word lists keyed by language code; each source text is decoded with
# the encoding listed next to it.
stopwords = dict(
    (lang_code, build_stopwords(source, encoding=charset))
    for lang_code, source, charset in (
        ("en", english, "latin-1"),
        ("es", spanish, "latin-1"),
        ("ru", russian, "koi8_r"),
        ("fr", french, "latin-1"),
        ("de", german, "latin-1"),
        ("it", italian, "latin-1"),
    )
)

# One SimpleStopper per language code, filled with that language's words.
stoppers = {}

for code in stopwords:
    stoppers[code] = stopper = SimpleStopper()
    for word in stopwords[code]:
        stopper.add(word)





Beispiel #4
0
from xapian import SimpleStopper
from . import english, spanish, french, german, italian, russian


def build_stopwords(language, encoding="utf8"):
    """Return the stop words listed in ``language.stopwords`` as unicode.

    The text is read line by line.  On each line everything from the first
    ``|`` onwards is discarded, the remainder is stripped of surrounding
    whitespace and decoded with ``encoding``; lines that end up empty are
    skipped.

    Relies on the Python 2 ``unicode`` builtin.
    NOTE(review): ``StringIO`` must be in scope as well; confirm that it is
    imported at the top of the module.

    Args:
        language: Object exposing the raw stop-word text as ``.stopwords``.
        encoding: Codec used to decode each entry.

    Returns:
        List of unicode stop words, in the order they appear in the text.
    """
    decoded = (
        # Keep only what precedes the first "|" on the line.
        unicode(raw_line.partition("|")[0].strip(), encoding)
        for raw_line in StringIO(language.stopwords)
    )
    return [entry for entry in decoded if entry]


# Stop-word lists keyed by language code; each source text is decoded with
# the encoding listed next to it.
stopwords = dict(
    (lang_code, build_stopwords(source, encoding=charset))
    for lang_code, source, charset in (
        ("en", english, "latin-1"),
        ("es", spanish, "latin-1"),
        ("ru", russian, "koi8_r"),
        ("fr", french, "latin-1"),
        ("de", german, "latin-1"),
        ("it", italian, "latin-1"),
    )
)

# One SimpleStopper per language code, filled with that language's words.
stoppers = {}

for code in stopwords:
    stoppers[code] = stopper = SimpleStopper()
    for word in stopwords[code]:
        stopper.add(word)