Example 1
    def get_words(self, unit):
        """Return list of word pairs for an unit."""
        words = set()

        # Prepare analyzers
        # - standard analyzer simply splits words
        # - stemming extracts stems, to catch things like plurals
        analyzers = [
            (SimpleAnalyzer(), True),
            (SimpleAnalyzer(expression=SPLIT_RE, gaps=True), True),
            (StandardAnalyzer(), False),
            (StemmingAnalyzer(), False),
        ]
        source_language = unit.translation.subproject.project.source_language
        lang_code = source_language.base_code()
        # Add per language analyzer if Whoosh has it
        if has_stemmer(lang_code):
            analyzers.append((LanguageAnalyzer(lang_code), False))
        # Add ngram analyzer for languages like Chinese or Japanese
        if source_language.uses_ngram():
            analyzers.append((NgramAnalyzer(4), False))

        # Extract words from all plurals and from context
        for text in unit.get_source_plurals() + [unit.context]:
            for analyzer, combine in analyzers:
                # Some Whoosh analyzers break on unicode
                new_words = []
                try:
                    new_words = [token.text for token in analyzer(text)]
                except (UnicodeDecodeError, IndexError) as error:
                    report_error(error, sys.exc_info())
                words.update(new_words)
                # Add combined strings to allow matching against multi-word
                # entries, combining up to 5 words
                if combine:
                    words.update([
                        ' '.join(new_words[x:y]) for x in range(len(new_words))
                        for y in range(1, min(x + 6,
                                              len(new_words) + 1)) if x != y
                    ])

        # Grab all words in the dictionary
        dictionary = self.filter(project=unit.translation.subproject.project,
                                 language=unit.translation.language)

        if '' in words:
            words.remove('')

        if len(words) == 0:
            # No extracted words, no dictionary
            dictionary = dictionary.none()
        else:
            # Build the query for fetching the words
            # Cannot use __in as we want a case-insensitive lookup
            dictionary = dictionary.filter(source__iregex=r'^({0})$'.format(
                '|'.join([re_escape(word) for word in words])))

        return dictionary
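A minimal standalone sketch (not Weblate's actual code) of the phrase-combination step above: every contiguous run of one to five tokens becomes a candidate lookup term, which is what the nested range() comprehension achieves.

tokens = ["hello", "brave", "new", "world"]
candidates = {
    " ".join(tokens[start:start + length])
    for start in range(len(tokens))
    for length in range(1, 6)
    if start + length <= len(tokens)
}
# candidates includes e.g. "hello", "brave new" and "hello brave new world"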
Example 2
class Question(db.Model, BaseMixin, DateTimeMixin):
    __tablename__ = 'questions'
    __searchable__ = ['title']
    __analyzer__ = SimpleAnalyzer()
    id = db.Column(db.Integer, primary_key=True)
    title = db.Column(db.String(64))  # question title
    description = db.Column(db.Text)  # question description
    author_id = db.Column(db.Integer, db.ForeignKey('users.id'))  # user who asked the question
    topics = db.relationship('QuestionTopic',
                             backref='question',
                             lazy='dynamic',
                             foreign_keys=[QuestionTopic.question_id])  # topics this question belongs to
    favorites = db.relationship('QuestionFavorite',
                                backref='question',
                                lazy='dynamic',
                                foreign_keys=[QuestionFavorite.question_id],
                                cascade='all,delete-orphan')
    answers = db.relationship('Answer', backref='question',
                              lazy='dynamic')  # answers to this question
    browse_count = db.Column(db.Integer, default=0)  # number of times the question has been viewed
    anonymous = db.Column(db.Boolean, default=False)  # whether the question was asked anonymously
    disable_comment = db.Column(db.Boolean, default=False)  # whether comments are disabled
    comments = db.relationship('Comment', backref='question', lazy='dynamic')
    followers = db.relationship('FollowQuestion',
                                backref=db.backref('followed', lazy='joined'),
                                lazy='dynamic',
                                foreign_keys=[FollowQuestion.followed_id],
                                cascade='all,delete-orphan')
    answers_count = db.Column(db.Integer)
    comments_count = db.Column(db.Integer)
    followers_count = db.Column(db.Integer)

    def disable(self):
        self.disable_comment = True
        db.session.add(self)

    @hybrid_property
    def browsed(self):
        return self.browse_count

    @browsed.setter
    def browsed(self, val):
        self.browse_count = val
        db.session.add(self)
        db.session.commit()

    def is_followed_by(self, user):
        return self.followers.filter_by(
            follower_id=user.id).first() is not None

    @property
    def q_topics(self):
        return [i.topic for i in self.topics.all()]

    @property
    def undelete_comments(self):
        return self.comments.filter(Comment.was_delete == False)

    def __repr__(self):
        return '<Question {}>'.format(self.title)
Example 3
	def tokenize_query(self, query, use_concepts=False):
		"""tokenize query"""
		analyzer = SimpleAnalyzer()
		if use_concepts:  # uppercase tokens since CUIs are in uppercase 
			return [token.text.upper() for token in analyzer(query)]
		else:
			return [token.text for token in analyzer(query)]
Example 4
def index(index_dir, final_file):
    print("\trun index...")
    analyzer = SimpleAnalyzer(expression=r"[\w,.\"\\\-:\'_ ]+")
    schema = Schema(names=TEXT(stored=True),
                    data=STORED,
                    films=TEXT(stored=True, analyzer=analyzer))
    if not os.path.exists(index_dir):
        os.mkdir(index_dir)

    ix = create_in(index_dir, schema)
    writer_actor = ix.writer()

    with gzip.open(final_file, 'rb') as f:
        count = 0

        for line in f:
            decode_line = line.decode("utf-8")
            count += 1

            array_line = handler.return_array(decode_line)
            names = array_line[0].split(
                '\t') if array_line[0] != "NONE" else []
            aliases = array_line[1].split(
                '\t') if array_line[1] != "NONE" else []
            writer_actor.add_document(names="\t".join(names + aliases),
                                      data=array_line[2:4],
                                      films="@".join(array_line[4:]))

    print("\t\twrite index...")
    writer_actor.commit()
    del writer_actor
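A possible way to query an index built by the function above (illustrative only; the "films" field comes from the schema in the snippet, while the directory and query text are made up):

from whoosh import index as whoosh_index
from whoosh.qparser import QueryParser

def search_films(index_dir, text, limit=10):
    ix = whoosh_index.open_dir(index_dir)
    parser = QueryParser("films", schema=ix.schema)
    with ix.searcher() as searcher:
        # stored fields ("names", "films") are returned for each hit
        return [hit.fields() for hit in searcher.search(parser.parse(text), limit=limit)]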
Example 5
 def initialize_index(self):
     results = db.cachedb.execute_and_fetchall(
         "SELECT card_id, fields FROM card_index_keywords")
     schema = Schema(title=ID(stored=True),
                     idolized=BOOLEAN,
                     short=TEXT,
                     owned=NUMERIC,
                     chara=TEXT,
                     rarity=TEXT,
                     color=TEXT,
                     skill=TEXT,
                     carnival=TEXT,
                     leader=TEXT,
                     fes=TEXT,
                     noir=TEXT,
                     blanc=TEXT,
                     main_attribute=TEXT,
                     time_prob_key=TEXT,
                     content=TEXT(analyzer=SimpleAnalyzer()))
     ix = create_in(INDEX_PATH, schema)
     writer = ix.writer()
     logger.debug("Initializing quicksearch index for {} cards".format(
         len(results)))
     for result in results:
         fields = ast.literal_eval(result[1])
         content = " ".join([fields[key] for key in KEYWORD_KEYS_STR_ONLY])
         writer.add_document(title=str(result[0]),
                             content=content,
                             **fields)
     writer.commit()
     self.index = ix
     logger.debug("Quicksearch index initialized for {} cards".format(
         len(results)))
Example 6
class Category(db.Model):

    __tablename__ = "category"
    __searchable__ = ['title', 'content']
    __analyzer__ = SimpleAnalyzer()

    id = db.Column(db.Integer(), primary_key=True, nullable=False)
    user = db.Column(db.Integer(), db.ForeignKey('users.id'), nullable=False)
    title = db.Column(db.String(30), nullable=False)
    content = db.Column(db.Text(100000), nullable=False)
    update_time = db.Column(db.DateTime,
                            nullable=False,
                            default=datetime.utcnow)
    collect_num = db.Column(db.Integer(), default=0, nullable=False)
    commented = db.relationship(
        'Comment',  # comments on this article
        backref=db.backref('post', lazy='joined'),
        lazy='dynamic',
        cascade='all, delete-orphan')

    disabled = db.Column(db.Boolean, nullable=False, default=False)

    favorite = db.relationship(
        'Favorite',  # users who favorited this article
        foreign_keys=[Favorite.favorited_id],
        backref=db.backref('favorited', lazy='joined'),
        lazy='dynamic',
        cascade='all, delete-orphan')

    def __repr__(self):

        data = {'title': self.title, 'content': self.content}
        return str(data)
Example 7
    def index(self, index_name='unified'):
        types = self.get_requested_content_types()

        from whoosh.fields import TEXT, ID, NGRAM, NUMERIC
        from whoosh.analysis import StemmingAnalyzer, SimpleAnalyzer, IDAnalyzer
        from whoosh.analysis.filters import LowercaseFilter
        simp_ana = SimpleAnalyzer()
        print 'Building %s index...' % index_name

        # build a single schema from the fields exposed by the different search
        # types
        print '\tSchema:'
        fields = {}
        for type in types:
            for info in type.get_fields_info().values():
                if info['whoosh']['name'] not in fields and not info[
                        'whoosh'].get('ignore', False):
                    print '\t\t%s' % info
                    field_type = info['whoosh']['type']

                    if index_name == 'autocomplete':
                        # break the long text fields into terms, leave the
                        # others as single expression
                        if not (field_type.__class__ == NUMERIC):
                            if info.get('long_text', False):
                                field_type = TEXT(analyzer=simp_ana)
                            else:
                                field_type = ID(stored=True,
                                                analyzer=IDAnalyzer()
                                                | LowercaseFilter())
                    print '\t\t%s' % field_type
                    fields[info['whoosh']['name']] = field_type

                    # JIRA 508 - Add an ID counterpart to allow exact phrase search
#                     if info.get('long_text', False):
#                         fields[info['whoosh']['name']+'_iexact'] = ID(analyzer=IDAnalyzer(lowercase=True))

        from whoosh.fields import Schema
        schema = Schema(**fields)

        # Create the index schema
        index = self.recreate_index(index_name, schema)

        # Add documents to the index
        print '\tWrite indexes:'
        writer = index.writer()
        aci = {}
        for type in types:
            count = type.write_index(writer, self.is_verbose(), aci)
            print '\t\t%s %s records indexed' % (count,
                                                 type.get_model().__name__)

        # autocomplete
        if index_name == 'unified':
            f = open(types[0].get_autocomplete_path(True), 'w')
            f.write((ur'|'.join(aci.keys())).encode('utf8'))
            f.close()

        writer.commit()
Example 8
    def _mk_schema(self, dsinfo):
        from whoosh import fields as wf
        from whoosh.analysis import SimpleAnalyzer

        # haven for terms that have been found to be undefined
        # (for faster decision-making upon next encounter)
        # this will harvest all discovered term definitions
        definitions = {
            '@id': 'unique identifier of an entity',
            # TODO make proper JSON-LD definition
            'path':
            'path name of an entity relative to the searched base dataset',
            # TODO make proper JSON-LD definition
            'parentds': 'path of the datasets that contains an entity',
            # 'type' will not come from a metadata field, hence will not be detected
            'type': 'type of a record',
        }

        schema_fields = {
            n.lstrip('@'): wf.ID(stored=True, unique=n == '@id')
            for n in definitions
        }

        lgr.debug('Scanning for metadata keys')
        # quick 1st pass over all dataset to gather the needed schema fields
        log_progress(
            lgr.info,
            'idxschemabuild',
            'Start building search schema',
            total=len(dsinfo),
            label='Building search schema',
            unit=' Datasets',
        )
        for res in query_aggregated_metadata(
                # XXX TODO After #2156 datasets may not necessarily carry all
                # keys in the "unique" summary
                reporton='datasets',
                ds=self.ds,
                aps=[dict(path=self.ds.path, type='dataset')],
                recursive=True):
            meta = res.get('metadata', {})
            # no stringification of values for speed, we do not need/use the
            # actual values at this point, only the keys
            idxd = _meta2autofield_dict(meta, val2str=False)

            for k in idxd:
                schema_fields[k] = wf.TEXT(stored=False,
                                           analyzer=SimpleAnalyzer())
            log_progress(lgr.info,
                         'idxschemabuild',
                         'Scanned dataset at %s',
                         res['path'],
                         update=1,
                         increment=True)
        log_progress(lgr.info, 'idxschemabuild', 'Done building search schema')

        self.schema = wf.Schema(**schema_fields)
Example 9
class Topic(db.Model, BaseMixin, DateTimeMixin):
    __tablename__ = 'topics'
    __analyzer__ = SimpleAnalyzer()
    __searchable__ = ['title']
    id = db.Column(db.Integer, primary_key=True)
    title = db.Column(db.String(64))  # topic title
    description = db.Column(db.Text)  # topic description
    cover_url = db.Column(db.String(256))  # cover image
    cover_url_sm = db.Column(db.String(256))  # cover thumbnail image
    author_id = db.Column(db.Integer, db.ForeignKey('users.id'))  # creator
    followers = db.relationship('FollowTopic',
                                backref=db.backref('followed', lazy='joined'),
                                lazy='dynamic',
                                foreign_keys=[FollowTopic.followed_id],
                                cascade='all,delete-orphan')
    questions = db.relationship('QuestionTopic',
                                backref='topic',
                                lazy='dynamic',
                                foreign_keys=[QuestionTopic.topic_id])  # questions under this topic
    follower_count = db.Column(db.Integer)
    questions_count = db.Column(db.Integer)

    def __repr__(self):
        return '<Topic {}>'.format(self.title)

    def add_question(self, question):
        if not self.is_in_topic(question):
            QuestionTopic.create(topic=self, question=question)
            return True
        return False

    def remove_question(self, question):
        f = self.questions.filter_by(question_id=question.id).first()
        if f:
            db.session.delete(f)
            db.session.commit()

    def is_in_topic(self, question):

        return self.questions.filter_by(
            question_id=question.id).first() is not None

    @classmethod
    def topic_exists(cls, title):
        return cls.query.filter_by(title=title).first() is not None

    @classmethod
    def generate_topics(cls):
        for each in topics:
            Topic.create(title=each, description='话题描述')

    def is_followed_by(self, user):
        return self.followers.filter_by(
            follower_id=user.id).first() is not None
Example 10
    def get_words(self, unit):
        """Return list of word pairs for an unit."""
        words = set()
        source_language = unit.translation.component.project.source_language

        # Filters stop words for a language
        try:
            stopfilter = StopFilter(lang=source_language.base_code)
        except NoStopWords:
            stopfilter = StopFilter()

        # Prepare analyzers
        # - simple analyzer just splits words based on regexp
        # - language analyzer if available (it is for English)
        analyzers = [
            SimpleAnalyzer(expression=SPLIT_RE, gaps=True) | stopfilter,
            LanguageAnalyzer(source_language.base_code),
        ]

        # Add ngram analyzer for languages like Chinese or Japanese
        if source_language.uses_ngram():
            analyzers.append(NgramAnalyzer(4))

        # Extract words from all plurals and from context
        flags = unit.all_flags
        for text in unit.get_source_plurals() + [unit.context]:
            text = strip_string(text, flags).lower()
            for analyzer in analyzers:
                # Some Whoosh analyzers break on unicode
                try:
                    words.update(token.text for token in analyzer(text))
                except (UnicodeDecodeError, IndexError) as error:
                    report_error(error)
                if len(words) > 1000:
                    break
            if len(words) > 1000:
                break

        if '' in words:
            words.remove('')

        if not words:
            # No extracted words, no dictionary
            return self.none()

        # Build the query for fetching the words
        # We want case insensitive lookup
        return self.filter(
            project=unit.translation.component.project,
            language=unit.translation.language,
            source__iregex=r'(^|[ \t\n\r\f\v])({0})($|[ \t\n\r\f\v])'.format(
                '|'.join(re_escape(word) for word in islice(words, 1000))),
        )
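For illustration, how a case-insensitive pattern like the one above can be assembled from a handful of extracted words; this sketch uses Python's re.escape in place of Weblate's re_escape helper, and the word set is made up:

import re
from itertools import islice

words = {"file", "open", "save as"}
pattern = r'(^|[ \t\n\r\f\v])({0})($|[ \t\n\r\f\v])'.format(
    '|'.join(re.escape(word) for word in islice(words, 1000)))
# matches any of the words when delimited by whitespace or the string boundaries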
Example 11
def exec_comp():
    '''
    Calculate MRR (Mean Reciprocal Rank) and save a table with the MRR evaluation for every search engine configuration.
    '''
    #text analyzers
    selected_analyzers = [
        StemmingAnalyzer(),
        SimpleAnalyzer(),
        StandardAnalyzer(),
        RegexAnalyzer(),
        FancyAnalyzer(),
        NgramAnalyzer(5),
        KeywordAnalyzer(),
        LanguageAnalyzer('en')
    ]  #text analyzers
    sel_ana = [
        'StemmingAnalyzer()', 'SimpleAnalyzer()', 'StandardAnalyzer()',
        'RegexAnalyzer()', 'FancyAnalyzer()', 'NgramAnalyzer(5)',
        'KeywordAnalyzer()', "LanguageAnalyzer('en')"
    ]  #text which will be used for graph and for mrr table

    i = 0  #counter
    mrrs = []  #list where MRR values for each SE configuration will be stored

    #scoring functions
    scoring_functions = [
        scoring.TF_IDF(),
        scoring.Frequency(),
        scoring.BM25F(B=0.75, content_B=1.0, K1=1.5)
    ]
    scor_func = [' TF_IDF', ' Frequency', ' BM25F']

    #ground truth
    gt1 = pd.read_csv(os.getcwd() +
                      "/part_1/Cranfield_DATASET/cran_Ground_Truth.tsv",
                      sep='\t')

    #combinations for every chosen analyzer with every chosen scoring function
    for x in range(len(selected_analyzers)):
        for y in range(len(scoring_functions)):
            print(sel_ana[x] + scor_func[y])
            i = i + 1
            sr_1 = exec_queries(
                selected_analyzers[x], scoring_functions[y]
            )  # execute queries for the chosen configuration combination
            sr_1.to_csv(os.getcwd() + "/part_1/" + str(i) + "__.csv",
                        index=False)  #save results of the search engine
            mrrs.append((sel_ana[x] + scor_func[y], mrr(gt1,
                                                        sr_1)))  #calculate MRR
    mrrs_saving = pd.DataFrame(mrrs)
    mrrs_saving.to_csv(os.getcwd() + "/part_1/mrrs.csv",
                       index=False)  #store MRR table
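The mrr() helper referenced above is not shown. As a rough, generic illustration of the metric (not the project's implementation, which works on the Cranfield DataFrames): for every query take the reciprocal of the rank of the first relevant result, then average over all queries.

def mean_reciprocal_rank(relevant_by_query, ranked_by_query):
    """relevant_by_query: query id -> set of relevant doc ids
    ranked_by_query: query id -> ranked list of retrieved doc ids"""
    total = 0.0
    for qid, ranked_docs in ranked_by_query.items():
        relevant = relevant_by_query.get(qid, set())
        for rank, doc_id in enumerate(ranked_docs, start=1):
            if doc_id in relevant:
                total += 1.0 / rank
                break
    return total / len(ranked_by_query) if ranked_by_query else 0.0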
Example 12
class BlogPost(db.Model):
    __tablename__ = 'user'
    __searchable__ = ['name',
                      'email']  # these fields will be indexed by whoosh
    __analyzer__ = SimpleAnalyzer()  # configure analyzer; defaults to
    # StemmingAnalyzer if not specified

    id = db.Column(db.Integer, primary_key=True)
    name = db.Column(db.String)  # Indexed fields are either String,
    password = db.Column(db.String)  # Unicode, or Text
    email = db.Column(db.String)  # Unicode, or Text
    created = db.Column(db.DateTime, default=datetime.datetime.utcnow)
    status = db.Column(db.Integer)
Example 13
    def new_searching_field(self, field_name, field_data):
        """
        Add a new searching field. It will be used by the search engine recommender.
        If the schema is not yet defined, the writer will add the field_name to the schema.

        Args:
            field_name (str): Name of the new field
            field_data: Data to put into the field
        """
        if not self.__schema_defined:
            self.__writer.add_field(
                field_name, TEXT(stored=True, analyzer=SimpleAnalyzer()))
        self.__doc[field_name] = field_data
Example 14
class NewsSummary(Summary):
    __searchable__ = ['bullets']
    __analyzer__ = SimpleAnalyzer()

    __mapper_args__ = {
        'polymorphic_identity': 'newssummary',
        'inherit_condition': (id == Summary.id)
    }

    title = db.Column(db.String)
    pub_date = db.Column(db.DateTime)
    image_path = db.Column(db.String(80))

    news_source_id = db.Column(db.Integer, db.ForeignKey('news_source.id'))
    news_source = db.relationship('NewsSource',
                                  backref=db.backref('summaries',
                                                     lazy='dynamic'))

    news_category_id = db.Column(db.Integer, db.ForeignKey('news_category.id'))
    news_category = db.relationship('NewsCategory',
                                    backref=db.backref('summaries',
                                                       lazy='dynamic'))

    def __init__(
        self,
        title,
        bullets,
        highlighted_text,
        news_source,
        source_url,
        news_category,
        date_added=None,
        pub_date=None,
        image_path='',
    ):
        super(NewsSummary, self).__init__(bullets,
                                          highlighted_text,
                                          source_url=source_url,
                                          date_added=date_added)

        self.title = title
        self.news_source = news_source
        self.news_category = news_category
        self.pub_date = pub_date
        if image_path:
            self.image_path = image_path
        else:
            self.image_path = news_source.image_path

    def __repr__(self):
        return '<NewsSummary {0}>'.format(self.title)
Example 15
    def _schema(self):
        # Creates a schema given this object's mingram and maxgram attributes.

        from whoosh.fields import Schema, FieldType, ID, STORED
        from whoosh.formats import Frequency
        from whoosh.analysis import SimpleAnalyzer

        idtype = ID()
        freqtype = FieldType(Frequency(), SimpleAnalyzer())

        fls = [("word", STORED), ("score", STORED)]
        for size in xrange(self.mingram, self.maxgram + 1):
            fls.extend([("start%s" % size, idtype), ("end%s" % size, idtype),
                        ("gram%s" % size, freqtype)])

        return Schema(**dict(fls))
Example 16
	def get_syns(self, term2cui, term_dict):
		"""get synonymic relations between words within corpus (derived from a semantic lexicon)"""
		syns = {}
		umls_lookup = umls.UMLSLookup()
		analyzer = SimpleAnalyzer()
		for term, cui in term2cui.items():
			if term in term_dict:
				if cui != '__NULL__':
					# get synset composed of single-word terms (reference term excluded)
					synset = {syn[0].lower() for syn in umls_lookup.lookup_synonyms(cui, preferred=False) if len(list(analyzer(syn[0]))) == 1 and syn[0].lower() in term_dict and syn[0].lower() != term}
					if len(synset) > 0:
						syns[term] = list(synset)
					else:
						syns[term] = list()
				else:
					syns[term] = list()
		return syns
Example 17
class Answer(db.Model, BaseMixin, DateTimeMixin):
    __tablename__ = 'answers'
    __searchable__ = ['body']
    __analyzer__ = SimpleAnalyzer()
    id = db.Column(db.Integer, primary_key=True)
    body = db.Column(db.Text)  # answer content
    author_id = db.Column(db.Integer, db.ForeignKey('users.id'))  # author
    question_id = db.Column(db.Integer,
                            db.ForeignKey('questions.id'))  # which question this answer belongs to
    favorites = db.relationship('AnswerFavorite',
                                lazy='dynamic',
                                backref='answer',
                                foreign_keys=[AnswerFavorite.answer_id],
                                cascade='all,delete-orphan')
    anonymous = db.Column(db.Boolean, default=False)  # whether the author answered anonymously
    disable_comment = db.Column(db.Boolean, default=False)  # whether comments are disabled
    comments = db.relationship('Comment', backref='answer', lazy='dynamic')
    liked_answers = db.relationship('LikeAnswer',
                                    backref=db.backref('answer_liked',
                                                       lazy='joined'),
                                    lazy='dynamic',
                                    foreign_keys=[LikeAnswer.answer_liked_id],
                                    cascade='all,delete-orphan')
    liked_count = db.Column(db.Integer, default=0)  # used for ordering by like count
    comments_count = db.Column(db.Integer)

    def count_ping(self):
        """点赞,取消赞之后调用"""
        self.liked_count = self.liked_answers.count()
        db.session.add(self)
        db.session.commit()

    def is_liked_by(self, user):
        return self.liked_answers.filter_by(
            like_answer_id=user.id).first() is not None

    def disable(self):
        self.disable_comment = True
        db.session.add(self)

    @property
    def undelete_comments(self):
        return self.comments.filter(Comment.was_delete == False)

    def __repr__(self):
        return '<Answer {}>'.format(self.id)
Example 18
    def test_custom_analyzer(self):
        from whoosh.analysis import SimpleAnalyzer
        self.app.config['WHOOSH_ANALYZER'] = SimpleAnalyzer()
        db.init_app(self.app)
        db.create_all()
        db.session.add(ObjectA(title='jumping', content=''))
        db.session.commit()
        assert not list(ObjectA.query.whoosh_search('jump'))
        assert ['jumping'] == [obj.title for obj in ObjectA.query.whoosh_search('jumping')]

        db.session.add(ObjectD(title='Travelling', content='Stemming'))
        db.session.add(ObjectD(title='travel', content='Unstemmed and normal'))
        db.session.add(ObjectD(title='trevel', content='Mispelt'))

        db.session.commit()
        # When misspelt on either the indexed side or the query side, they should all return 3 due to the DoubleMetaphoneFilter
        self.assertEqual(len(list(ObjectD.query.whoosh_search('travelling'))), 3)
        self.assertEqual(len(list(ObjectD.query.whoosh_search('trovel'))), 3)
Example 19
def get_syns(term2cui, word_dict, umls_lookup):
    """get synonymy relations from corpus and lexicon as a dictionary"""
    syns = {}
    analyzer = SimpleAnalyzer()
    for term, cui in term2cui.items():
        if cui != "__NULL__":
            synset = {
                word_dict[syn[0].lower()]
                for syn in umls_lookup.lookup_synonyms(cui, preferred=False)
                if len(list(analyzer(syn[0]))) == 1
                and syn[0].lower() in word_dict and syn[0].lower() != term
            }
            if len(synset) > 0:
                syns[word_dict[term]] = list(synset)
            else:
                syns[word_dict[term]] = []
        else:
            syns[word_dict[term]] = []
    return syns
Example 20
 def initialize_chart_index(self):
     results = db.cachedb.execute_and_fetchall(
         "SELECT live_detail_id, performers, special_keys, jp_name, name, level, color, difficulty FROM live_detail_cache"
     )
     schema = Schema(title=ID(stored=True),
                     live_detail_id=NUMERIC,
                     performers=TEXT,
                     special_keys=TEXT,
                     jp_name=TEXT,
                     name=TEXT,
                     difficulty=TEXT,
                     level=NUMERIC,
                     color=TEXT,
                     content=TEXT(analyzer=SimpleAnalyzer()))
     ix = create_in(INDEX_PATH, schema, indexname="score")
     writer = ix.writer()
     logger.debug("Initializing quicksearch index for {} charts".format(
         len(results)))
     for result in results:
         difficulty = Difficulty(result[-1]).name.lower()
         performers = result[1].replace(",", "") if result[1] else ""
         color = Color(result[6] - 1).name.lower()
         content = " ".join([
             performers, result[2] if result[2] else "", result[3],
             result[4], difficulty, color,
             str(result[5])
         ])
         writer.add_document(
             title=str(result[0]),
             content=content,
             live_detail_id=result[0],
             performers=performers,
             special_keys=result[2],
             jp_name=result[3],
             name=result[4],
             level=result[5],
             color=color,
             difficulty=difficulty,
         )
     writer.commit()
     self.song_index = ix
     logger.debug("Quicksearch index initialized for {} charts".format(
         len(results)))
Example 21
def open_index(indexdir, incremental=False):
    """
    Opens the index with the given name. If the directory or the index
    does not yet exist, it is created.

    @type  indexdir: str
    @param indexdir: The name of the index directory.
    @type  incremental: bool
    @param incremental: Whether to preserve existing index content.
    @rtype:  whoosh.Index
    @return: An object representing the index.
    """
    if not os.path.exists(indexdir):
        os.makedirs(indexdir)
    if incremental and index.exists_in(indexdir):
        return index.open_dir(indexdir)
    schema = Schema(number=NUMERIC(stored=True),
                    filename=ID(stored=True),
                    line=TEXT(analyzer=SimpleAnalyzer(), stored=True))
    return index.create_in(indexdir, schema)
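A hypothetical usage of open_index (the directory name and document values are made up):

from whoosh.qparser import QueryParser

ix = open_index("grep_index", incremental=True)
writer = ix.writer()
writer.add_document(number=42, filename="example.py", line="import sys")
writer.commit()

with ix.searcher() as searcher:
    query = QueryParser("line", ix.schema).parse("import")
    for hit in searcher.search(query):
        print(hit["filename"], hit["number"])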
Example 22
class Article(db.Model):
    __tablename__ = 'article'
    __searchable__ = ['title', 'content']  # searchable fields
    __analyzer__ = SimpleAnalyzer()

    id = db.Column(db.Integer, primary_key=True)
    title = db.Column(db.String(100), nullable=False)
    tag1 = db.Column(db.String(100), nullable=True)
    tag2 = db.Column(db.String(100), nullable=True)
    tag3 = db.Column(db.String(100), nullable=True)
    short_content = db.Column(db.String(512), nullable=True)
    content = db.Column(db.Text, nullable=False)
    update_time = db.Column(db.DateTime, default=datetime.now)
    create_time = db.Column(db.DateTime, default=datetime.now)
    author_id = db.Column(db.Integer, db.ForeignKey('user.id'))
    category_id = db.Column(db.Integer, db.ForeignKey('category.id'))
    user = db.relationship('User', backref=db.backref('articles'))  # forward and backward references
    category = db.relationship('Category',
                               backref=db.backref('articles'))  # forward and backward references
    is_delete = db.Column(db.Boolean, default=0)
Example 23
class NewsSource(db.Model):
    __searchable__ = ['name']
    __analyzer__ = SimpleAnalyzer()

    id = db.Column(db.Integer, primary_key=True)
    name = db.Column(db.String(32))
    slug = db.Column(db.String(32))
    feed_url = db.Column(db.String(80))
    image_path = db.Column(db.String(80))

    def __init__(self, name, image_path=''):
        self.name = name
        self.slug = Tokenizer().strip_all_punctuation(name.lower()).replace(
            ' ', '_')
        if not image_path:
            self.image_path = '/static/images/news/sources/{0}.png'.format(
                self.slug)
        else:
            self.image_path = image_path

    def __repr__(self):
        return '<NewsSource {0}>'.format(self.name)
Example 24
    def __init__(self, index_path, language):
        from whoosh import index as whoosh_index
        from whoosh.fields import Schema, TEXT, ID
        from whoosh import qparser
        from whoosh.highlight import UppercaseFormatter
        from whoosh.analysis import SimpleAnalyzer, LanguageAnalyzer
        from whoosh.lang import has_stemmer, has_stopwords
        import os

        if not has_stemmer(language) or not has_stopwords(language):
            # TODO Display a warning?
            analyzer = SimpleAnalyzer()
        else:
            analyzer = LanguageAnalyzer(language)

        self.schema = Schema(path=ID(unique=True, stored=True),
                             body=TEXT(analyzer=analyzer))
        self.formatter = UppercaseFormatter()

        self.index_path = index_path

        if not os.path.exists(index_path):
            try:
                os.mkdir(index_path)
            except OSError as e:
                sys.exit("Error creating Whoosh index: %s" % e)

        if whoosh_index.exists_in(index_path):
            try:
                self.search_index = whoosh_index.open_dir(index_path)
            except whoosh_index.IndexError as e:
                sys.exit("Error opening whoosh index: {0}".format(e))
        else:
            self.search_index = whoosh_index.create_in(index_path, self.schema)

        self.query_parser = qparser.MultifieldParser(["body", "path"],
                                                     schema=self.schema)
        self.query_parser.add_plugin(qparser.FuzzyTermPlugin())
Example 25
def process_corpus(corpus_path, out_path):
    """process corpus: split docs into words and return tokenized corpus"""
    corpus = get_trec_doc(corpus_path)
    # set tokenizer
    tokenizer = SimpleAnalyzer()
    # tokenize corpus and store into words
    print("tokenizing corpus...")
    words = []
    dfreqs = {}
    docs = {}
    for docno, doc in corpus:
        # tokenize docs
        doc_tokens = [token.text for token in tokenizer(doc)]
        # assign tokens
        docs[docno] = doc_tokens
        words.extend(doc_tokens)
        # update doc frequencies
        for token in set(doc_tokens):
            if token in dfreqs:
                dfreqs[token] += 1
            else:
                dfreqs[token] = 1
    print("corpus tokenized!")
    print("computing IDF scores for words within corpus")
    idfs = {
        token: np.log(len(docs) / (1 + float(dfreq)))
        for token, dfreq in dfreqs.items()
    }
    print("store processed data")
    with open(out_path + '/words.json', 'w') as file_words:
        json.dump(words, file_words)
    with open(out_path + '/docs.json', 'w') as file_docs:
        json.dump(docs, file_docs)
    with open(out_path + '/idfs.json', 'w') as file_idfs:
        json.dump(idfs, file_idfs)
    return words
Example 26
    def get_terms(self, unit):
        """Return list of term pairs for an unit."""
        words = set()
        source_language = unit.translation.component.project.source_language

        # Filters stop words for a language
        try:
            stopfilter = StopFilter(lang=source_language.base_code)
        except NoStopWords:
            stopfilter = StopFilter()

        # Prepare analyzers
        # - basic simple analyzer to split on non-word chars
        # - simple analyzer just splits words based on regexp to catch in word dashes
        # - language analyzer if available (it is for English)
        analyzers = [
            SimpleAnalyzer() | stopfilter,
            SimpleAnalyzer(expression=SPLIT_RE, gaps=True) | stopfilter,
            LanguageAnalyzer(source_language.base_code),
        ]

        # Add ngram analyzer for languages like Chinese or Japanese
        if source_language.uses_ngram():
            analyzers.append(NgramAnalyzer(4))

        # Extract words from all plurals and from context
        flags = unit.all_flags
        for text in unit.get_source_plurals() + [unit.context]:
            text = strip_string(text, flags).lower()
            for analyzer in analyzers:
                # Some Whoosh analyzers break on unicode
                try:
                    words.update(token.text for token in analyzer(text))
                except (UnicodeDecodeError, IndexError):
                    report_error(cause="Term words parsing")
                if len(words) > 1000:
                    break
            if len(words) > 1000:
                break

        if "" in words:
            words.remove("")

        if not words:
            # No extracted words, no glossary
            return self.none()

        # Build the query for fetching the words
        # We want case insensitive lookup
        words = islice(words, 1000)
        if settings.DATABASES["default"][
                "ENGINE"] == "django.db.backends.postgresql":
            # Use regex as that is utilizing pg_trgm index
            results = self.filter(
                source__iregex=r"(^|[ \t\n\r\f\v])({0})($|[ \t\n\r\f\v])".
                format("|".join(re_escape(word) for word in words)), )
        else:
            # MySQL
            results = self.filter(
                reduce(
                    lambda x, y: x | y,
                    (models.Q(source__search=word) for word in words),
                ), )

        return results.for_project(unit.translation.component.project).filter(
            language=unit.translation.language)
Example 27
 def tokenize_query(self, q):
     """lowerize and tokenize query"""
     analyzer = SimpleAnalyzer()
     return [token.text for token in analyzer(q)]
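For reference, a quick standalone check (assumed, not part of the original project) of what the default SimpleAnalyzer produces: tokens are split on non-word characters and lowercased.

from whoosh.analysis import SimpleAnalyzer

analyzer = SimpleAnalyzer()
print([token.text for token in analyzer("Mean Reciprocal Rank, 2024!")])
# expected output along the lines of: ['mean', 'reciprocal', 'rank', '2024']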
Example 28
 def schema_type(self):
     return TEXT(stored=True, analyzer=SimpleAnalyzer())
Example 29
 def tokenize_query(self, query):
     """tokenize query"""
     analyzer = SimpleAnalyzer()
     return [token.text for token in analyzer(query)]
Example 30
import os
import sys
import pickle
from docx import Document
import PyPDF2
from bs4 import BeautifulSoup
from whoosh import fields, index
from whoosh.analysis import SimpleAnalyzer, StopFilter

sys.path.append(
    os.path.join(os.path.abspath(os.path.dirname(__file__)), os.pardir,
                 os.pardir))
from searchEngine.seconfig import SearchEngineConfig
import time

WHOOSH_SCHEMA = fields.Schema(title=fields.TEXT(stored=True),
                              path=fields.ID(stored=True, unique=True),
                              content=fields.TEXT(analyzer=SimpleAnalyzer()
                                                  | StopFilter(),
                                                  stored=False))
FILE_INDEXED_LIST = []


# Creates a list of all the files in the lookup directory
def list_all_files():
    file_name_list = []
    for path, subdirs, files in os.walk(
            SearchEngineConfig.DOCUMENT_LOOKUP_DIRECTORY):
        for name in files:
            extension = os.path.splitext(name)[1]
            if extension in SearchEngineConfig.SUPPORTED_EXTENSIONS:
                file_name_list.append(str(os.path.join(path, name)))
    return file_name_list
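A hypothetical follow-up step (not part of the original snippet; the directory name is illustrative) that creates the on-disk index from WHOOSH_SCHEMA so the listed files can later be added to it:

def create_document_index(index_dir="se_index"):
    # relies on the os and whoosh.index imports of the module above
    if not os.path.exists(index_dir):
        os.mkdir(index_dir)
    return index.create_in(index_dir, WHOOSH_SCHEMA)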