def __init__(self, queryString='', db_name='Psychology_abstracts'):
    global stopwords
    # Look up the target database, loading the registry on first use.
    try:
        database = ld.databases[db_name]
    except KeyError:
        ld.getDatabases()
        database = ld.databases[db_name]

    # TODO: this should go under loadtext in the future, as it is db specific.
    self.tf = pickle.load(open('tf.pkl', 'rb'))    # term frequencies
    self.df = pickle.load(open('df.pkl', 'rb'))    # document frequencies
    self.np_to_stem = pickle.load(open('np_to_stem.pkl', 'rb'))

    # Build the reverse mapping: stem -> list of noun phrases.
    self.stem_to_np = defaultdict(list)
    for np, stem in self.np_to_stem.iteritems():
        self.stem_to_np[stem].append(np)

    # Per-document noun-phrase counts.
    noun_phrases = pickle.load(open('noun_phrases.pkl', 'rb'))
    self.noun_phrase_counts = {}
    for k, v in noun_phrases.iteritems():
        self.noun_phrase_counts[str(k)] = Counter(v)

    # Per-document noun-stem counts.
    noun_stems = pickle.load(open('noun_stems.pkl', 'rb'))
    self.noun_stem_counts = {}
    for k, v in noun_stems.iteritems():
        self.noun_stem_counts[str(k)] = Counter(v)

    self.database = database
    self.queryString = queryString

    # Open the Xapian index and pull every document into memory.
    db_doc = xapian.Database(self.database.xapian)
    total_documents = db_doc.get_doccount()
    self.N = total_documents
    enquire = xapian.Enquire(db_doc)
    enquire.set_query(xapian.Query.MatchAll)
    matches = enquire.get_mset(0, total_documents)
    self.allDocs = []
    for match in matches:
        # id=match.docid
        self.allDocs.append(self.database.get_doc(match.document.get_data()))

    # Register the module-level stop words with a Xapian SimpleStopper.
    stopper = SimpleStopper()
    for sword in stopwords:
        stopper.add(sword)
    self.stopper = stopper
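# A minimal sketch of the module-level imports the constructor above assumes;
# the pickle loads, Counter/defaultdict usage, and xapian calls dictate most of
# them.  The "loaddata" module name is hypothetical -- only the "ld" alias
# (ld.databases / ld.getDatabases()) appears in the original code.
import pickle
from collections import Counter, defaultdict

import xapian
from xapian import SimpleStopper

import loaddata as ld   # hypothetical: whatever module supplies ld.databases and ld.getDatabases()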
from StringIO import StringIO   # Python 2; use io.StringIO under Python 3

from xapian import SimpleStopper

from . import english, spanish, french, german, italian, russian


def build_stopwords(language, encoding="utf8"):
    """Parse a stop-word list: one word per line, text after '|' is a comment."""
    stream = StringIO(language.stopwords)
    stopwords = []
    for line in stream.readlines():
        word = unicode(line.strip().split("|")[0].strip(), encoding)
        if word:
            stopwords.append(word)
    return stopwords


stopwords = {
    "en": build_stopwords(english, encoding="latin-1"),
    "es": build_stopwords(spanish, encoding="latin-1"),
    "ru": build_stopwords(russian, encoding="koi8_r"),
    "fr": build_stopwords(french, encoding="latin-1"),
    "de": build_stopwords(german, encoding="latin-1"),
    "it": build_stopwords(italian, encoding="latin-1"),
}

# One ready-to-use SimpleStopper per language code.
stoppers = {}
for code in stopwords:
    stopper = SimpleStopper()
    for word in stopwords[code]:
        stopper.add(word)
    stoppers[code] = stopper
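# A hedged usage sketch (not from the original source): the stoppers built
# above are typically handed to a xapian.QueryParser, which then drops stop
# words while parsing query text.  The query string below is made up.
import xapian

qp = xapian.QueryParser()
qp.set_stopper(stoppers["en"])   # the stoppers dict keeps the stopper alive
query = qp.parse_query("the history of memory research")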