def get_words(self, unit): """Return list of word pairs for an unit.""" words = set() # Prepare analyzers # - standard analyzer simply splits words # - stemming extracts stems, to catch things like plurals analyzers = [ (SimpleAnalyzer(), True), (SimpleAnalyzer(expression=SPLIT_RE, gaps=True), True), (StandardAnalyzer(), False), (StemmingAnalyzer(), False), ] source_language = unit.translation.subproject.project.source_language lang_code = source_language.base_code() # Add per language analyzer if Whoosh has it if has_stemmer(lang_code): analyzers.append((LanguageAnalyzer(lang_code), False)) # Add ngram analyzer for languages like Chinese or Japanese if source_language.uses_ngram(): analyzers.append((NgramAnalyzer(4), False)) # Extract words from all plurals and from context for text in unit.get_source_plurals() + [unit.context]: for analyzer, combine in analyzers: # Some Whoosh analyzers break on unicode new_words = [] try: new_words = [token.text for token in analyzer(text)] except (UnicodeDecodeError, IndexError) as error: report_error(error, sys.exc_info()) words.update(new_words) # Add combined string to allow match against multiple word # entries allowing to combine up to 5 words if combine: words.update([ ' '.join(new_words[x:y]) for x in range(len(new_words)) for y in range(1, min(x + 6, len(new_words) + 1)) if x != y ]) # Grab all words in the dictionary dictionary = self.filter(project=unit.translation.subproject.project, language=unit.translation.language) if '' in words: words.remove('') if len(words) == 0: # No extracted words, no dictionary dictionary = dictionary.none() else: # Build the query for fetching the words # Can not use __in as we want case insensitive lookup dictionary = dictionary.filter(source__iregex=r'^({0})$'.format( '|'.join([re_escape(word) for word in words]))) return dictionary
class Question(db.Model, BaseMixin, DateTimeMixin):
    __tablename__ = 'questions'
    __searchable__ = ['title']
    __analyzer__ = SimpleAnalyzer()

    id = db.Column(db.Integer, primary_key=True)
    title = db.Column(db.String(64))  # question title
    description = db.Column(db.Text)  # question description
    author_id = db.Column(db.Integer, db.ForeignKey('users.id'))  # user who asked the question
    topics = db.relationship('QuestionTopic',
                             backref='question',
                             lazy='dynamic',
                             foreign_keys=[QuestionTopic.question_id])  # topics this question belongs to
    favorites = db.relationship('QuestionFavorite',
                                backref='question',
                                lazy='dynamic',
                                foreign_keys=[QuestionFavorite.question_id],
                                cascade='all,delete-orphan')
    answers = db.relationship('Answer', backref='question', lazy='dynamic')  # answers
    browse_count = db.Column(db.Integer, default=0)  # how many times the question has been viewed
    anonymous = db.Column(db.Boolean, default=False)  # whether the question was asked anonymously
    disable_comment = db.Column(db.Boolean, default=False)  # whether comments are disabled
    comments = db.relationship('Comment', backref='question', lazy='dynamic')
    followers = db.relationship('FollowQuestion',
                                backref=db.backref('followed', lazy='joined'),
                                lazy='dynamic',
                                foreign_keys=[FollowQuestion.followed_id],
                                cascade='all,delete-orphan')
    answers_count = db.Column(db.Integer)
    comments_count = db.Column(db.Integer)
    followers_count = db.Column(db.Integer)

    def disable(self):
        self.disable_comment = True
        db.session.add(self)

    @hybrid_property
    def browsed(self):
        return self.browse_count

    @browsed.setter
    def browsed(self, val):
        self.browse_count = val
        db.session.add(self)
        db.session.commit()

    def is_followed_by(self, user):
        return self.followers.filter_by(
            follower_id=user.id).first() is not None

    @property
    def q_topics(self):
        return [i.topic for i in self.topics.all()]

    @property
    def undelete_comments(self):
        return self.comments.filter(Comment.was_delete == False)

    def __repr__(self):
        return '<Question {}>'.format(self.title)
def tokenize_query(self, query, use_concepts=False):
    """tokenize query"""
    analyzer = SimpleAnalyzer()
    if use_concepts:
        # uppercase tokens since CUIs are in uppercase
        return [token.text.upper() for token in analyzer(query)]
    else:
        return [token.text for token in analyzer(query)]
def index(index_dir, final_file):
    print("\trun index...")
    analyzer = SimpleAnalyzer(expression=r"[\w,.\"\\\-:\'_ ]+")
    schema = Schema(names=TEXT(stored=True),
                    data=STORED,
                    films=TEXT(stored=True, analyzer=analyzer))
    if not os.path.exists(index_dir):
        os.mkdir(index_dir)
    ix = create_in(index_dir, schema)
    writer_actor = ix.writer()
    with gzip.open(final_file, 'rb') as f:
        count = 0
        for line in f:
            decode_line = line.decode("utf-8")
            count += 1
            array_line = handler.return_array(decode_line)
            names = array_line[0].split('\t') if array_line[0] != "NONE" else []
            aliases = array_line[1].split('\t') if array_line[1] != "NONE" else []
            writer_actor.add_document(names="\t".join(names + aliases),
                                      data=array_line[2:4],
                                      films="@".join(array_line[4:]))
    print("\t\twrite index...")
    writer_actor.commit()
    del writer_actor
def initialize_index(self):
    results = db.cachedb.execute_and_fetchall(
        "SELECT card_id, fields FROM card_index_keywords")
    schema = Schema(title=ID(stored=True),
                    idolized=BOOLEAN,
                    short=TEXT,
                    owned=NUMERIC,
                    chara=TEXT,
                    rarity=TEXT,
                    color=TEXT,
                    skill=TEXT,
                    carnival=TEXT,
                    leader=TEXT,
                    fes=TEXT,
                    noir=TEXT,
                    blanc=TEXT,
                    main_attribute=TEXT,
                    time_prob_key=TEXT,
                    content=TEXT(analyzer=SimpleAnalyzer()))
    ix = create_in(INDEX_PATH, schema)
    writer = ix.writer()
    logger.debug("Initializing quicksearch index for {} cards".format(
        len(results)))
    for result in results:
        fields = ast.literal_eval(result[1])
        content = " ".join([fields[key] for key in KEYWORD_KEYS_STR_ONLY])
        writer.add_document(title=str(result[0]), content=content, **fields)
    writer.commit()
    self.index = ix
    logger.debug("Quicksearch index initialized for {} cards".format(
        len(results)))
class Category(db.Model):
    __tablename__ = "category"
    __searchable__ = ['title', 'content']
    __analyzer__ = SimpleAnalyzer()

    id = db.Column(db.Integer(), primary_key=True, nullable=False)
    user = db.Column(db.Integer(), db.ForeignKey('users.id'), nullable=False)
    title = db.Column(db.String(30), nullable=False)
    content = db.Column(db.Text(100000), nullable=False)
    update_time = db.Column(db.DateTime, nullable=False,
                            default=datetime.utcnow())
    collect_num = db.Column(db.Integer(), default=0, nullable=False)
    commented = db.relationship(
        'Comment',  # comments on this post
        backref=db.backref('post', lazy='joined'),
        lazy='dynamic',
        cascade='all, delete-orphan')
    disabled = db.Column(db.Boolean, nullable=False, default=False)
    favorite = db.relationship(
        'Favorite',  # users who favorited this post
        foreign_keys=[Favorite.favorited_id],
        backref=db.backref('favorited', lazy='joined'),
        lazy='dynamic',
        cascade='all, delete-orphan')

    def __repr__(self):
        data = {'title': self.title, 'content': self.content}
        return str(data)
def index(self, index_name='unified'):
    types = self.get_requested_content_types()

    from whoosh.fields import TEXT, ID, NGRAM, NUMERIC
    from whoosh.analysis import StemmingAnalyzer, SimpleAnalyzer, IDAnalyzer
    from whoosh.analysis.filters import LowercaseFilter
    simp_ana = SimpleAnalyzer()
    print 'Building %s index...' % index_name

    # build a single schema from the fields exposed by the different search
    # types
    print '\tSchema:'
    fields = {}
    for type in types:
        for info in type.get_fields_info().values():
            if info['whoosh']['name'] not in fields and not info[
                    'whoosh'].get('ignore', False):
                print '\t\t%s' % info
                field_type = info['whoosh']['type']

                if index_name == 'autocomplete':
                    # break the long text fields into terms, leave the
                    # others as single expression
                    if not (field_type.__class__ == NUMERIC):
                        if info.get('long_text', False):
                            field_type = TEXT(analyzer=simp_ana)
                        else:
                            field_type = ID(stored=True,
                                            analyzer=IDAnalyzer()
                                            | LowercaseFilter())
                    print '\t\t%s' % field_type

                fields[info['whoosh']['name']] = field_type

                # JIRA 508 - Add an ID counterpart to allow exact phrase search
                # if info.get('long_text', False):
                #     fields[info['whoosh']['name']+'_iexact'] = ID(analyzer=IDAnalyzer(lowercase=True))

    from whoosh.fields import Schema
    schema = Schema(**fields)

    # Create the index schema
    index = self.recreate_index(index_name, schema)

    # Add documents to the index
    print '\tWrite indexes:'
    writer = index.writer()
    aci = {}
    for type in types:
        count = type.write_index(writer, self.is_verbose(), aci)
        print '\t\t%s %s records indexed' % (count, type.get_model().__name__)

    # autocomplete
    if index_name == 'unified':
        f = open(types[0].get_autocomplete_path(True), 'w')
        f.write((ur'|'.join(aci.keys())).encode('utf8'))
        f.close()

    writer.commit()
def _mk_schema(self, dsinfo):
    from whoosh import fields as wf
    from whoosh.analysis import SimpleAnalyzer

    # haven for terms that have been found to be undefined
    # (for faster decision-making upon next encounter)
    # this will harvest all discovered term definitions
    definitions = {
        '@id': 'unique identifier of an entity',
        # TODO make proper JSON-LD definition
        'path': 'path name of an entity relative to the searched base dataset',
        # TODO make proper JSON-LD definition
        'parentds': 'path of the datasets that contains an entity',
        # 'type' will not come from a metadata field, hence will not be detected
        'type': 'type of a record',
    }

    schema_fields = {
        n.lstrip('@'): wf.ID(stored=True, unique=n == '@id')
        for n in definitions
    }

    lgr.debug('Scanning for metadata keys')
    # quick 1st pass over all dataset to gather the needed schema fields
    log_progress(
        lgr.info,
        'idxschemabuild',
        'Start building search schema',
        total=len(dsinfo),
        label='Building search schema',
        unit=' Datasets',
    )
    for res in query_aggregated_metadata(
            # XXX TODO After #2156 datasets may not necessarily carry all
            # keys in the "unique" summary
            reporton='datasets',
            ds=self.ds,
            aps=[dict(path=self.ds.path, type='dataset')],
            recursive=True):
        meta = res.get('metadata', {})
        # no stringification of values for speed, we do not need/use the
        # actual values at this point, only the keys
        idxd = _meta2autofield_dict(meta, val2str=False)

        for k in idxd:
            schema_fields[k] = wf.TEXT(stored=False,
                                       analyzer=SimpleAnalyzer())
        log_progress(lgr.info, 'idxschemabuild',
                     'Scanned dataset at %s', res['path'],
                     update=1, increment=True)
    log_progress(lgr.info, 'idxschemabuild', 'Done building search schema')

    self.schema = wf.Schema(**schema_fields)
class Topic(db.Model, BaseMixin, DateTimeMixin):
    __tablename__ = 'topics'
    __analyzer__ = SimpleAnalyzer()
    __searchable__ = ['title']

    id = db.Column(db.Integer, primary_key=True)
    title = db.Column(db.String(64))  # topic title
    description = db.Column(db.Text)  # topic description
    cover_url = db.Column(db.String(256))  # cover image
    cover_url_sm = db.Column(db.String(256))  # cover thumbnail
    author_id = db.Column(db.Integer, db.ForeignKey('users.id'))  # creator
    followers = db.relationship('FollowTopic',
                                backref=db.backref('followed', lazy='joined'),
                                lazy='dynamic',
                                foreign_keys=[FollowTopic.followed_id],
                                cascade='all,delete-orphan')
    questions = db.relationship('QuestionTopic',
                                backref='topic',
                                lazy='dynamic',
                                foreign_keys=[QuestionTopic.topic_id])  # questions under this topic
    follower_count = db.Column(db.Integer)
    questions_count = db.Column(db.Integer)

    def __repr__(self):
        return '<Topic {}>'.format(self.title)

    def add_question(self, question):
        if not self.is_in_topic(question):
            QuestionTopic.create(topic=self, question=question)
            return True
        return False

    def remove_question(self, question):
        f = self.questions.filter_by(question_id=question.id).first()
        if f:
            db.session.delete(f)
            db.session.commit()

    def is_in_topic(self, question):
        return self.questions.filter_by(
            question_id=question.id).first() is not None

    @classmethod
    def topic_exists(cls, title):
        return cls.query.filter_by(title=title).first() is not None

    @classmethod
    def generate_topics(cls):
        for each in topics:
            Topic.create(title=each, description='话题描述')

    def is_followed_by(self, user):
        return self.followers.filter_by(
            follower_id=user.id).first() is not None
def get_words(self, unit):
    """Return list of word pairs for a unit."""
    words = set()
    source_language = unit.translation.component.project.source_language

    # Filters stop words for a language
    try:
        stopfilter = StopFilter(lang=source_language.base_code)
    except NoStopWords:
        stopfilter = StopFilter()

    # Prepare analyzers
    # - simple analyzer just splits words based on regexp
    # - language analyzer if available (it is for English)
    analyzers = [
        SimpleAnalyzer(expression=SPLIT_RE, gaps=True) | stopfilter,
        LanguageAnalyzer(source_language.base_code),
    ]

    # Add ngram analyzer for languages like Chinese or Japanese
    if source_language.uses_ngram():
        analyzers.append(NgramAnalyzer(4))

    # Extract words from all plurals and from context
    flags = unit.all_flags
    for text in unit.get_source_plurals() + [unit.context]:
        text = strip_string(text, flags).lower()
        for analyzer in analyzers:
            # Some Whoosh analyzers break on unicode
            try:
                words.update(token.text for token in analyzer(text))
            except (UnicodeDecodeError, IndexError) as error:
                report_error(error)
            if len(words) > 1000:
                break
        if len(words) > 1000:
            break

    if '' in words:
        words.remove('')

    if not words:
        # No extracted words, no dictionary
        return self.none()

    # Build the query for fetching the words
    # We want case insensitive lookup
    return self.filter(
        project=unit.translation.component.project,
        language=unit.translation.language,
        source__iregex=r'(^|[ \t\n\r\f\v])({0})($|[ \t\n\r\f\v])'.format(
            '|'.join(re_escape(word) for word in islice(words, 1000))),
    )
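For illustration, here is a minimal standalone sketch of the shape of the case-insensitive regular expression built in the snippet above. It is not Weblate code: the sample words are made up and Python's re.escape stands in for Weblate's re_escape helper.

import re

# Hypothetical sample words standing in for the extracted set.
words = ["cat", "dog", "bird"]
pattern = r'(^|[ \t\n\r\f\v])({0})($|[ \t\n\r\f\v])'.format(
    '|'.join(re.escape(word) for word in words))
print(pattern)
# (^|[ \t\n\r\f\v])(cat|dog|bird)($|[ \t\n\r\f\v])

# A word only matches when delimited by whitespace or string boundaries.
print(bool(re.search(pattern, "the dog sleeps", re.IGNORECASE)))  # True
print(bool(re.search(pattern, "underdog", re.IGNORECASE)))        # False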
def exec_comp():
    '''
    Calculate MRR (Mean Reciprocal Rank) and save a table with the MRR
    evaluation for every search engine configuration.
    '''
    # text analyzers
    selected_analyzers = [
        StemmingAnalyzer(),
        SimpleAnalyzer(),
        StandardAnalyzer(),
        RegexAnalyzer(),
        FancyAnalyzer(),
        NgramAnalyzer(5),
        KeywordAnalyzer(),
        LanguageAnalyzer('en')
    ]

    # analyzer names, used for the graph and for the MRR table
    sel_ana = [
        'StemmingAnalyzer()', 'SimpleAnalyzer()', 'StandardAnalyzer()',
        'RegexAnalyzer()', 'FancyAnalyzer()', 'NgramAnalyzer(5)',
        'KeywordAnalyzer()', 'LanguageAnalyzer()'
    ]

    i = 0  # counter
    mrrs = []  # MRR values for each search engine configuration

    # scoring functions
    scoring_functions = [
        scoring.TF_IDF(),
        scoring.Frequency(),
        scoring.BM25F(B=0.75, content_B=1.0, K1=1.5)
    ]
    scor_func = [' TF_IDF', ' Frequency', ' BM25F']

    # ground truth
    gt1 = pd.read_csv(os.getcwd() +
                      "/part_1/Cranfield_DATASET/cran_Ground_Truth.tsv",
                      sep='\t')

    # combine every chosen analyzer with every chosen scoring function
    for x in range(len(selected_analyzers)):
        for y in range(len(scoring_functions)):
            print(sel_ana[x] + scor_func[y])
            i = i + 1
            # execute queries for the chosen configuration combination
            sr_1 = exec_queries(selected_analyzers[x], scoring_functions[y])
            # save results of the search engine
            sr_1.to_csv(os.getcwd() + "/part_1/" + str(i) + "__.csv",
                        index=False)
            # calculate MRR
            mrrs.append((sel_ana[x] + scor_func[y], mrr(gt1, sr_1)))

    # store MRR table
    mrrs_saving = pd.DataFrame(mrrs)
    mrrs_saving.to_csv(os.getcwd() + "/part_1/mrrs.csv", index=False)
class BlogPost(db.Model):
    __tablename__ = 'user'
    __searchable__ = ['name', 'email']  # these fields will be indexed by whoosh
    __analyzer__ = SimpleAnalyzer()     # configure analyzer; defaults to
                                        # StemmingAnalyzer if not specified

    id = db.Column(db.Integer, primary_key=True)
    name = db.Column(db.String)      # Indexed fields are either String,
    password = db.Column(db.String)  # Unicode, or Text
    email = db.Column(db.String)     # Unicode, or Text
    created = db.Column(db.DateTime, default=datetime.datetime.utcnow)
    status = db.Column(db.Integer)
def new_searching_field(self, field_name, field_data):
    """
    Add a new searching field. It will be used by the search engine
    recommender. If the schema is not yet defined, the writer will add
    the field_name inside the schema.

    Args:
        field_name (str): Name of the new field
        field_data: Data to put into the field
    """
    if not self.__schema_defined:
        self.__writer.add_field(
            field_name, TEXT(stored=True, analyzer=SimpleAnalyzer()))
    self.__doc[field_name] = field_data
class NewsSummary(Summary):
    __searchable__ = ['bullets']
    __analyzer__ = SimpleAnalyzer()
    __mapper_args__ = {
        'polymorphic_identity': 'newssummary',
        'inherit_condition': (id == Summary.id)
    }

    title = db.Column(db.String)
    pub_date = db.Column(db.DateTime)
    image_path = db.Column(db.String(80))

    news_source_id = db.Column(db.Integer, db.ForeignKey('news_source.id'))
    news_source = db.relationship('NewsSource',
                                  backref=db.backref('summaries',
                                                     lazy='dynamic'))

    news_category_id = db.Column(db.Integer, db.ForeignKey('news_category.id'))
    news_category = db.relationship('NewsCategory',
                                    backref=db.backref('summaries',
                                                       lazy='dynamic'))

    def __init__(
        self,
        title,
        bullets,
        highlighted_text,
        news_source,
        source_url,
        news_category,
        date_added=None,
        pub_date=None,
        image_path='',
    ):
        super(NewsSummary, self).__init__(bullets,
                                          highlighted_text,
                                          source_url=source_url,
                                          date_added=date_added)
        self.title = title
        self.news_source = news_source
        self.news_category = news_category
        self.pub_date = pub_date

        if image_path:
            self.image_path = image_path
        else:
            self.image_path = news_source.image_path

    def __repr__(self):
        return '<NewsSummary {0}>'.format(self.title)
def _schema(self):
    # Creates a schema given this object's mingram and maxgram attributes.
    from whoosh.fields import Schema, FieldType, ID, STORED
    from whoosh.formats import Frequency
    from whoosh.analysis import SimpleAnalyzer

    idtype = ID()
    freqtype = FieldType(Frequency(), SimpleAnalyzer())

    fls = [("word", STORED), ("score", STORED)]
    for size in xrange(self.mingram, self.maxgram + 1):
        fls.extend([("start%s" % size, idtype),
                    ("end%s" % size, idtype),
                    ("gram%s" % size, freqtype)])

    return Schema(**dict(fls))
def get_syns(self, term2cui, term_dict):
    """get synonymic relations between words within corpus (derived from a semantic lexicon)"""
    syns = {}
    umls_lookup = umls.UMLSLookup()
    analyzer = SimpleAnalyzer()
    for term, cui in term2cui.items():
        if term in term_dict:
            if cui != '__NULL__':
                # get synset composed of single-word terms (reference term excluded)
                synset = {
                    syn[0].lower()
                    for syn in umls_lookup.lookup_synonyms(cui, preferred=False)
                    if len(list(analyzer(syn[0]))) == 1
                    and syn[0].lower() in term_dict
                    and syn[0].lower() != term
                }
                if len(synset) > 0:
                    syns[term] = list(synset)
                else:
                    syns[term] = list()
            else:
                syns[term] = list()
    return syns
class Answer(db.Model, BaseMixin, DateTimeMixin):
    __tablename__ = 'answers'
    __searchable__ = ['body']
    __analyzer__ = SimpleAnalyzer()

    id = db.Column(db.Integer, primary_key=True)
    body = db.Column(db.Text)  # answer body
    author_id = db.Column(db.Integer, db.ForeignKey('users.id'))  # author
    question_id = db.Column(db.Integer, db.ForeignKey('questions.id'))  # question this answer belongs to
    favorites = db.relationship('AnswerFavorite',
                                lazy='dynamic',
                                backref='answer',
                                foreign_keys=[AnswerFavorite.answer_id],
                                cascade='all,delete-orphan')
    anonymous = db.Column(db.Boolean, default=False)  # whether the author answered anonymously
    disable_comment = db.Column(db.Boolean, default=False)  # whether comments are disabled
    comments = db.relationship('Comment', backref='answer', lazy='dynamic')
    liked_answers = db.relationship('LikeAnswer',
                                    backref=db.backref('answer_liked', lazy='joined'),
                                    lazy='dynamic',
                                    foreign_keys=[LikeAnswer.answer_liked_id],
                                    cascade='all,delete-orphan')
    liked_count = db.Column(db.Integer, default=0)  # used to sort by number of likes
    comments_count = db.Column(db.Integer)

    def count_ping(self):
        """Called after a like or an unlike."""
        self.liked_count = self.liked_answers.count()
        db.session.add(self)
        db.session.commit()

    def is_liked_by(self, user):
        return self.liked_answers.filter_by(
            like_answer_id=user.id).first() is not None

    def disable(self):
        self.disable_comment = True
        db.session.add(self)

    @property
    def undelete_comments(self):
        return self.comments.filter(Comment.was_delete == False)

    def __repr__(self):
        return '<Answer {}>'.format(self.id)
def test_custom_analyzer(self):
    from whoosh.analysis import SimpleAnalyzer
    self.app.config['WHOOSH_ANALYZER'] = SimpleAnalyzer()
    db.init_app(self.app)
    db.create_all()

    db.session.add(ObjectA(title='jumping', content=''))
    db.session.commit()
    assert not list(ObjectA.query.whoosh_search('jump'))
    assert ['jumping'] == [obj.title
                           for obj in ObjectA.query.whoosh_search('jumping')]

    db.session.add(ObjectD(title='Travelling', content='Stemming'))
    db.session.add(ObjectD(title='travel', content='Unstemmed and normal'))
    db.session.add(ObjectD(title='trevel', content='Mispelt'))
    db.session.commit()

    # When misspelt on either the indexed side or the query side, they should
    # all return 3 due to the DoubleMetaphoneFilter
    self.assertEqual(len(list(ObjectD.query.whoosh_search('travelling'))), 3)
    self.assertEquals(len(list(ObjectD.query.whoosh_search('trovel'))), 3)
def get_syns(term2cui, word_dict, umls_lookup):
    """get synonymy relations from corpus and lexicon as a dictionary"""
    syns = {}
    analyzer = SimpleAnalyzer()
    for term, cui in term2cui.items():
        if cui != "__NULL__":
            synset = {
                word_dict[syn[0].lower()]
                for syn in umls_lookup.lookup_synonyms(cui, preferred=False)
                if len(list(analyzer(syn[0]))) == 1
                and syn[0].lower() in word_dict
                and syn[0].lower() != term
            }
            if len(synset) > 0:
                syns[word_dict[term]] = list(synset)
            else:
                syns[word_dict[term]] = []
        else:
            syns[word_dict[term]] = []
    return syns
def initialize_chart_index(self):
    results = db.cachedb.execute_and_fetchall(
        "SELECT live_detail_id, performers, special_keys, jp_name, name, level, color, difficulty FROM live_detail_cache"
    )
    schema = Schema(title=ID(stored=True),
                    live_detail_id=NUMERIC,
                    performers=TEXT,
                    special_keys=TEXT,
                    jp_name=TEXT,
                    name=TEXT,
                    difficulty=TEXT,
                    level=NUMERIC,
                    color=TEXT,
                    content=TEXT(analyzer=SimpleAnalyzer()))
    ix = create_in(INDEX_PATH, schema, indexname="score")
    writer = ix.writer()
    logger.debug("Initializing quicksearch index for {} charts".format(
        len(results)))
    for result in results:
        difficulty = Difficulty(result[-1]).name.lower()
        performers = result[1].replace(",", "") if result[1] else ""
        color = Color(result[6] - 1).name.lower()
        content = " ".join([
            performers, result[2] if result[2] else "", result[3], result[4],
            difficulty, color,
            str(result[5])
        ])
        writer.add_document(
            title=str(result[0]),
            content=content,
            live_detail_id=result[0],
            performers=performers,
            special_keys=result[2],
            jp_name=result[3],
            name=result[4],
            level=result[5],
            color=color,
            difficulty=difficulty,
        )
    writer.commit()
    self.song_index = ix
    logger.debug("Quicksearch index initialized for {} charts".format(
        len(results)))
def open_index(indexdir, incremental=False):
    """
    Opens the index with the given name. If the directory or the index
    do not yet exist, they are created.

    @type indexdir: str
    @param indexdir: The name of the index directory.
    @type incremental: bool
    @param incremental: Whether to preserve existing index content.
    @rtype: whoosh.Index
    @return: An object representing the index.
    """
    if not os.path.exists(indexdir):
        os.makedirs(indexdir)

    if incremental and index.exists_in(indexdir):
        return index.open_dir(indexdir)

    schema = Schema(number=NUMERIC(stored=True),
                    filename=ID(stored=True),
                    line=TEXT(analyzer=SimpleAnalyzer(), stored=True))
    return index.create_in(indexdir, schema)
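A possible usage sketch for an index created by open_index above. This is not from the original project; the directory name and document values are invented, and it assumes the surrounding module's whoosh imports are available.

from whoosh.qparser import QueryParser

ix = open_index("lineindex")
writer = ix.writer()
writer.add_document(number=42, filename=u"example.py",
                    line=u"from whoosh.analysis import SimpleAnalyzer")
writer.commit()

with ix.searcher() as searcher:
    # SimpleAnalyzer lowercases tokens, so the match is case insensitive.
    query = QueryParser("line", ix.schema).parse(u"simpleanalyzer")
    for hit in searcher.search(query):
        print(hit["filename"], hit["number"])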
class Article(db.Model):
    __tablename__ = 'article'
    __searchable__ = ['title', 'content']  # searchable fields
    __analyzer__ = SimpleAnalyzer()

    id = db.Column(db.Integer, primary_key=True)
    title = db.Column(db.String(100), nullable=False)
    tag1 = db.Column(db.String(100), nullable=True)
    tag2 = db.Column(db.String(100), nullable=True)
    tag3 = db.Column(db.String(100), nullable=True)
    short_content = db.Column(db.String(512), nullable=True)
    content = db.Column(db.Text, nullable=False)
    update_time = db.Column(db.DateTime, default=datetime.now)
    create_time = db.Column(db.DateTime, default=datetime.now())
    author_id = db.Column(db.Integer, db.ForeignKey('user.id'))
    category_id = db.Column(db.Integer, db.ForeignKey('category.id'))
    user = db.relationship('User', backref=db.backref('articles'))  # forward and reverse references
    category = db.relationship('Category', backref=db.backref('articles'))  # forward and reverse references
    is_delete = db.Column(db.Boolean, default=0)
class NewsSource(db.Model):
    __searchable__ = ['name']
    __analyzer__ = SimpleAnalyzer()

    id = db.Column(db.Integer, primary_key=True)
    name = db.Column(db.String(32))
    slug = db.Column(db.String(32))
    feed_url = db.Column(db.String(80))
    image_path = db.Column(db.String(80))

    def __init__(self, name, image_path=''):
        self.name = name
        self.slug = Tokenizer().strip_all_punctuation(name.lower()).replace(
            ' ', '_')

        if not image_path:
            self.image_path = '/static/images/news/sources/{0}.png'.format(
                self.slug)
        else:
            self.image_path = image_path

    def __repr__(self):
        return '<NewsSource {0}>'.format(self.name)
def __init__(self, index_path, language):
    from whoosh import index as whoosh_index
    from whoosh.fields import Schema, TEXT, ID
    from whoosh import qparser
    from whoosh.highlight import UppercaseFormatter
    from whoosh.analysis import SimpleAnalyzer, LanguageAnalyzer
    from whoosh.lang import has_stemmer, has_stopwords
    import os

    if not has_stemmer(language) or not has_stopwords(language):
        # TODO Display a warning?
        analyzer = SimpleAnalyzer()
    else:
        analyzer = LanguageAnalyzer(language)

    self.schema = Schema(path=ID(unique=True, stored=True),
                         body=TEXT(analyzer=analyzer))
    self.formatter = UppercaseFormatter()

    self.index_path = index_path
    if not os.path.exists(index_path):
        try:
            os.mkdir(index_path)
        except OSError as e:
            sys.exit("Error creating Whoosh index: %s" % e)

    if whoosh_index.exists_in(index_path):
        try:
            self.search_index = whoosh_index.open_dir(index_path)
        except whoosh_index.IndexError as e:
            sys.exit("Error opening whoosh index: {0}".format(e))
    else:
        self.search_index = whoosh_index.create_in(index_path, self.schema)

    self.query_parser = qparser.MultifieldParser(["body", "path"],
                                                 schema=self.schema)
    self.query_parser.add_plugin(qparser.FuzzyTermPlugin())
def process_corpus(corpus_path, out_path):
    """process corpus: split docs into words and return tokenized corpus"""
    corpus = get_trec_doc(corpus_path)
    # set tokenizer
    tokenizer = SimpleAnalyzer()

    # tokenize corpus and store into words
    print("tokenizing corpus...")
    words = []
    dfreqs = {}
    docs = {}
    for docno, doc in corpus:
        # tokenize docs
        doc_tokens = [token.text for token in tokenizer(doc)]
        # assign tokens
        docs[docno] = doc_tokens
        words.extend(doc_tokens)
        # update doc frequencies
        for token in set(doc_tokens):
            if token in dfreqs:
                dfreqs[token] += 1
            else:
                dfreqs[token] = 1
    print("corpus tokenized!")

    print("computing IDF scores for words within corpus")
    idfs = {
        token: np.log(len(docs) / (1 + float(dfreq)))
        for token, dfreq in dfreqs.items()
    }

    print("store processed data")
    with open(out_path + '/words.json', 'w') as file_words:
        json.dump(words, file_words)
    with open(out_path + '/docs.json', 'w') as file_docs:
        json.dump(docs, file_docs)
    with open(out_path + '/idfs.json', 'w') as file_idfs:
        json.dump(idfs, file_idfs)
    return words
def get_terms(self, unit):
    """Return list of term pairs for a unit."""
    words = set()
    source_language = unit.translation.component.project.source_language

    # Filters stop words for a language
    try:
        stopfilter = StopFilter(lang=source_language.base_code)
    except NoStopWords:
        stopfilter = StopFilter()

    # Prepare analyzers
    # - basic simple analyzer to split on non-word chars
    # - simple analyzer that splits words based on a regexp to catch in-word dashes
    # - language analyzer if available (it is for English)
    analyzers = [
        SimpleAnalyzer() | stopfilter,
        SimpleAnalyzer(expression=SPLIT_RE, gaps=True) | stopfilter,
        LanguageAnalyzer(source_language.base_code),
    ]

    # Add ngram analyzer for languages like Chinese or Japanese
    if source_language.uses_ngram():
        analyzers.append(NgramAnalyzer(4))

    # Extract words from all plurals and from context
    flags = unit.all_flags
    for text in unit.get_source_plurals() + [unit.context]:
        text = strip_string(text, flags).lower()
        for analyzer in analyzers:
            # Some Whoosh analyzers break on unicode
            try:
                words.update(token.text for token in analyzer(text))
            except (UnicodeDecodeError, IndexError):
                report_error(cause="Term words parsing")
            if len(words) > 1000:
                break
        if len(words) > 1000:
            break

    if "" in words:
        words.remove("")

    if not words:
        # No extracted words, no glossary
        return self.none()

    # Build the query for fetching the words
    # We want case insensitive lookup
    words = islice(words, 1000)
    if settings.DATABASES["default"]["ENGINE"] == "django.db.backends.postgresql":
        # Use regex as that is utilizing pg_trgm index
        results = self.filter(
            source__iregex=r"(^|[ \t\n\r\f\v])({0})($|[ \t\n\r\f\v])".format(
                "|".join(re_escape(word) for word in words)),
        )
    else:
        # MySQL
        results = self.filter(
            reduce(
                lambda x, y: x | y,
                (models.Q(source__search=word) for word in words),
            ),
        )

    return results.for_project(unit.translation.component.project).filter(
        language=unit.translation.language)
def tokenize_query(self, q):
    """lowercase and tokenize query"""
    analyzer = SimpleAnalyzer()
    return [token.text for token in analyzer(q)]
def schema_type(self):
    return TEXT(stored=True, analyzer=SimpleAnalyzer())
def tokenize_query(self, query):
    """tokenize query"""
    analyzer = SimpleAnalyzer()
    return [token.text for token in analyzer(query)]
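As a rough standalone illustration of what these SimpleAnalyzer-based helpers return, here is a minimal sketch; the sample sentence is invented and the expected output assumes Whoosh's default token pattern.

from whoosh.analysis import SimpleAnalyzer

# SimpleAnalyzer is a RegexTokenizer piped into a LowercaseFilter, so it
# splits on non-word characters and lowercases each token.
analyzer = SimpleAnalyzer()
print([token.text for token in analyzer("Whoosh-based Search, 2024!")])
# Expected: ['whoosh', 'based', 'search', '2024']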
import os
import sys
import pickle
from docx import Document
import PyPDF2
from bs4 import BeautifulSoup
from whoosh import fields, index
from whoosh.analysis import SimpleAnalyzer, StopFilter

sys.path.append(
    os.path.join(os.path.abspath(os.path.dirname(__file__)), os.pardir,
                 os.pardir))
from searchEngine.seconfig import SearchEngineConfig
import time

WHOOSH_SCHEMA = fields.Schema(title=fields.TEXT(stored=True),
                              path=fields.ID(stored=True, unique=True),
                              content=fields.TEXT(analyzer=SimpleAnalyzer()
                                                  | StopFilter(),
                                                  stored=False))
FILE_INDEXED_LIST = []


# Creates a list of all the files in the lookup directory
def list_all_files():
    file_name_list = []
    for path, subdirs, files in os.walk(
            SearchEngineConfig.DOCUMENT_LOOKUP_DIRECTORY):
        for name in files:
            extension = os.path.splitext(name)[1]
            if extension in SearchEngineConfig.SUPPORTED_EXTENSIONS:
                file_name_list.append(str(os.path.join(path, name)))
    return file_name_list
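A small hedged sketch (not part of the module above) of what the SimpleAnalyzer() | StopFilter() chain used in WHOOSH_SCHEMA produces; the sample sentence is made up and the expected output assumes Whoosh's default stop word list.

from whoosh.analysis import SimpleAnalyzer, StopFilter

# Tokens are lowercased by SimpleAnalyzer and common stop words are dropped
# by StopFilter before the text is indexed.
chain = SimpleAnalyzer() | StopFilter()
print([token.text for token in chain("The quick brown fox and the lazy dog")])
# Expected: ['quick', 'brown', 'fox', 'lazy', 'dog']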