Code example #1
    def searchconn(self):
        """Open a search connection if there isn't one already open.

        """
        if self._sconn is None:
            self._sconn = xappy.SearchConnection(self.db_path)
        return self._sconn
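
The lazy-open pattern above pairs naturally with a reset step after the index is rebuilt. A minimal companion sketch, assuming the same hypothetical class with a _sconn attribute ("reset_searchconn" is illustrative, not part of xappy):

    def reset_searchconn(self):
        """Close the cached search connection so the next call reopens it.

        Illustrative companion to searchconn() above; useful after the
        index has been rebuilt, so stale results are not served.
        """
        if self._sconn is not None:
            self._sconn.close()
            self._sconn = None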
Code example #2
    def findPMIDsWithSynonyms(self, synonyms):
        if self.__searchConn is None:
            self.__searchConn = xappy.SearchConnection(self.__xapianPath)
            self.__searchConn.reopen()

        xapian_queries = []

        for querystring in synonyms:
            # Quote each synonym so it is matched as a phrase in every field.
            phrase = '"' + querystring + '"'
            for field in ('title', 'text', 'keyword', 'chemical_exact',
                          'mesh'):
                xapian_queries.append(
                    self.__searchConn.query_field(field, phrase))

        merged_q = self.__searchConn.query_composite(self.__searchConn.OP_OR,
                                                     xapian_queries)
        results = self.__searchConn.search(merged_q, 0,
                                           self.__searchConn.get_doccount())

        return [r.id for r in results]
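
For illustration, a hypothetical call site (the synonym strings are invented):

        pmids = self.findPMIDsWithSynonyms(['aspirin',
                                            'acetylsalicylic acid'])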
Code example #3
    def search_query(self, keywords):
        sconn = xappy.SearchConnection(self.search_db_dir)

        search = ' '.join(keywords)
        q = sconn.query_parse(search, default_op=sconn.OP_AND)
        results = sconn.search(q, 0, sconn.get_doccount())

        return map(lambda result: result.data["module_uid"][0], results)
Code example #4
File: se.py  Project: ChunHungLiu/watchdog-1
def test():
    sconn = xappy.SearchConnection(DBPATH)
    print sconn.get_doccount(), 'documents loaded.'

    def query(qtext):
        q = sconn.query_parse(sconn.spell_correct(qtext),
                              default_op=sconn.OP_AND)
        return [x.data['id'][0] for x in sconn.search(q, 0, 10)]

    assert query('biden joe') == ['joe_biden']
    assert query('barak obma') == ['barack_obama']
    return True
Code example #5
def _get_data(category):
    # load data from xapian categories
    db = datastore.corpus_db()
    sconn = xappy.SearchConnection(settings.XAPIAN_DB)
    q = sconn.query_field('category', category)

    offset = 0
    limit = 1000
    while True:
        results = sconn.search(q, offset, offset + limit)
        count = 0
        for result in results:
            count += 1
            doc = db[result.id]
            if not any(uri in doc['url'] for uri in DISCARD_URLS):
                yield doc
        # Stop once a page comes back short; otherwise fetch the next page.
        if count < limit:
            break
        offset += limit
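
Consuming the generator is then straightforward; a hypothetical usage sketch ('news' is a made-up category name):

for doc in _get_data('news'):
    print doc['url']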
Code example #6
    def get_search_connection(self, names):
        """Get a SearchConnection for a list of connections.

        :Parameters:
         - `names`: The name of a collection, or a sequence of names of
           collections.

        Returns a SearchConnection object.

        """
        if isinstance(names, types.StringTypes):
            names = (names, )
        assert (len(names) > 0)

        # FIXME - handle the case of a collection not being found more gracefully.
        col = self._collections[names[0]]
        result = xappy.SearchConnection(col.dbpath())
        log.debug('Search connection to %r opened' % (names[0], ))

        # Add the remaining databases to the connection in Result.
        # FIXME - this should really be done by xappy.  Currently, we have to
        # access the internal _index object and use it directly - which only
        # works properly if all the collections have the same index properties.
        for name in names[1:]:
            col = self._collections[name]
            result._index.add_database(xapian.Database(col.dbpath()))
            log.debug('Added %r to search connection' % (name, ))

        self._handle_count_condition.acquire()
        try:
            # Register the callback here.
            result.append_close_handler(self._search_connection_closed, names)

            # Now, increment the count of handles.
            for name in names:
                newcount = self._handle_count.get(name, 0) + 1
                self._handle_count[name] = newcount
                log.debug('New connection count for %r is %d' %
                          (name, newcount))
        finally:
            self._handle_count_condition.release()

        return result
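
For context, a hedged sketch of a call site: "pool" and the collection names are invented, and closing the connection is what fires the _search_connection_closed callback registered via append_close_handler above.

conn = pool.get_search_connection(['articles', 'archive'])
try:
    results = conn.search(conn.query_parse('xapian'), 0, 10)
    for r in results:
        print r.id
finally:
    conn.close()  # triggers _search_connection_closed for both collections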
Code example #7
    def search_query(self, keywords):
        '''
        Search the update-data package index for the given keywords.
        '''
        # Init search connect.
        search_db_path = os.path.join(UPDATE_DATA_DIR, "search", "zh_CN",
                                      "search_db")
        sconn = xappy.SearchConnection(search_db_path)

        # Do search.
        search = ' '.join(keywords).lower()
        q = sconn.query_parse(search, default_op=sconn.OP_AND)
        results = sconn.search(q,
                               0,
                               sconn.get_doccount(),
                               sortby="have_desktop_file")

        all_results = map(lambda result: result.data["pkg_name"][0], results)
        for keyword in keywords:
            match_names = self.get_pkgs_match_input(keyword)
            for name in match_names:
                if name not in all_results:
                    all_results.append(name)
        return all_results
Code example #8
File: search.py  Project: rmax/yatiri
def main(args):
    # search corpus index
    db = datastore.corpus_db()
    sconn = xappy.SearchConnection(settings.XAPIAN_DB)

    query = ' '.join(args.query)

    print "Search {} documents for '{}'".format(sconn.get_doccount(),
                                                args.query)

    q = sconn.query_parse(query, default_op=sconn.OP_AND)

    if args.category:
        qc = q.compose(
            q.OP_OR, [sconn.query_field('category', c) for c in args.category])
        q = q & qc

    if args.date:
        qd = q.compose(q.OP_OR,
                       [sconn.query_field('date', d) for d in args.date])
        q = q & qd

    if args.date_start and args.date_end:
        qr = sconn.query_range('date', args.date_start, args.date_end)
        q = q.filter(qr)

    if args.sort:
        sortby = [tuple(args.sort.split(','))]
    else:
        sortby = None

    print 'Query: {!r}'.format(q)
    results = execute_query(sconn,
                            q,
                            args.offset,
                            args.limit,
                            getfacets=args.facet,
                            allowfacets=('category', ),
                            sortby=sortby)

    if results.estimate_is_exact:
        print "Found {} results".format(results.matches_estimated)
    else:
        print "Found approximately {} results".format(
            results.matches_estimated)

    for i, result in enumerate(results, 1):
        doc = db[result.id]
        try:
            cat = result.get_terms('category').next()
        except StopIteration:
            cat = 'none'
        try:
            date = result.get_terms('date').next()
        except StopIteration:
            date = 'none'

        print "{:2}. {} -- {} -- {}\n\t{}\n\t{}\n".format(
            i, cat, doc['headline'], date, doc['url'], result.id)

    from IPython import embed
    embed()
Code example #9
#!/usr/bin/env python
# -*- coding: UTF-8 -*-
"""
    Copyright (c) 2014, Kersten Doering <*****@*****.**>, Bjoern Gruening <*****@*****.**>
"""
#Kersten Doering 04.06.2014

#check https://github.com/miracle2k/xappy/blob/master/docs/introduction.rst for nice examples

import xappy

searchConn = xappy.SearchConnection("xapian/xapian2015")
searchConn.reopen()

#########################

querystring = "pancreatic"

q = searchConn.query_field('title', querystring)

print "search query: ", q

#save all matching documents in "results" (starting at rank 0 - see the help text of the "search" function)
results = searchConn.search(q, 0, searchConn.get_doccount())

print "number of matches: ", results.matches_estimated

### debug: ###
#print first 5 titles with highlight function and save first 1000 titles in an HTML file
#print "### first 5 hits: ###"
#print "Rank\tPubMed-ID\tTitle (query term highlighted)"
Code example #10
def main():
    tornado.options.parse_command_line()

    from apps.main.models import User
    from apps.questions.models import Question, Genre
    from mongokit import Connection
    con = Connection()
    con.register([Question, Genre, User])
    db = con.gkc

    if options.reindex_all:
        since = datetime.datetime(1979, 12, 13)
    else:
        since = options.since
        if not since:
            since = default_since
        try:
            since = datetime.datetime.strptime(since, '%Y-%m-%d %H-%M-%S')
        except ValueError:
            since = datetime.datetime.strptime(since, '%Y-%m-%d')
    if options.verbose:
        print 'since', since

    genres = {}
    authors = {}
    count = 0
    search = {'modify_date': {'$gt': since}}
    if not db.Question.find(search).count():
        if options.verbose:
            print "0 questions"
        if not options.test:
            return
    youngest = since

    indexer = xappy.IndexerConnection(settings.XAPIAN_LOCATION)
    if not indexer.get_fields_with_actions() or options.update_fields:
        indexer.add_field_action('question',
                                 xappy.FieldActions.INDEX_FREETEXT,
                                 weight=2,
                                 language='en',
                                 spell=True,
                                 stop=stopwords)
        indexer.add_field_action(
            'answer',
            xappy.FieldActions.INDEX_FREETEXT,
            language='en',
            spell=True,
        )
        indexer.add_field_action('accept',
                                 xappy.FieldActions.INDEX_FREETEXT,
                                 language='en',
                                 spell=True)
        indexer.add_field_action('alternatives',
                                 xappy.FieldActions.INDEX_FREETEXT,
                                 language='en',
                                 spell=True)
        indexer.add_field_action('author', xappy.FieldActions.INDEX_EXACT)
        indexer.add_field_action('genre', xappy.FieldActions.INDEX_EXACT)
        indexer.add_field_action('comment',
                                 xappy.FieldActions.INDEX_FREETEXT,
                                 language='en',
                                 spell=False,
                                 search_by_default=False,
                                 stop=stopwords)
        indexer.add_field_action('date',
                                 xappy.FieldActions.SORTABLE,
                                 type="date")
        indexer.add_field_action('state', xappy.FieldActions.SORTABLE)

        indexer.add_field_action('question', xappy.FieldActions.STORE_CONTENT)
        indexer.add_field_action('answer', xappy.FieldActions.STORE_CONTENT)
        indexer.add_field_action('genre', xappy.FieldActions.STORE_CONTENT)
        indexer.add_field_action('state', xappy.FieldActions.STORE_CONTENT)

    t0 = time.time()
    for question in db.Question.collection.find(search):
        if question['modify_date'] > youngest:
            youngest = question['modify_date']
        doc = xappy.UnprocessedDocument()
        doc.fields.append(xappy.Field('state', question['state']))
        doc.fields.append(xappy.Field('question', question['text']))
        doc.fields.append(xappy.Field('answer', question['answer']))
        if question['genre'].id in genres:
            genre = genres[question['genre'].id]
        else:
            genre = db.Genre.one({'_id': question['genre'].id})
            genre = genre.name
            genres[question['genre'].id] = genre
        doc.fields.append(xappy.Field('genre', genre))
        if question['author'].id in authors:
            author = authors[question['author'].id]
        else:

            author = db.User.one({'_id': question['author'].id})
            author = author.username
            authors[question['author'].id] = author
        doc.fields.append(xappy.Field('author', author))
        doc.fields.append(xappy.Field('comment', question['comment']))
        doc.fields.append(xappy.Field('accept', '\n'.join(question['accept'])))
        doc.fields.append(
            xappy.Field('alternatives', '\n'.join(question['alternatives'])))
        doc.id = str(question['_id'])
        pdoc = indexer.process(doc)
        indexer.replace(pdoc)
        count += 1
        #if count and not count % 100:
        #    indexer.flush()
    # add a second so that sub-second timestamps do not cause the same doc to be indexed over and over
    youngest += datetime.timedelta(seconds=1)
    open(since_filename, 'w').write(youngest.strftime('%Y-%m-%d %H-%M-%S\n'))

    indexer.flush()
    t1 = time.time()
    indexer.close()
    if options.verbose:
        print round(t1 - t0, 3), "seconds to index", count, "questions"

    # test
    if options.test:
        print settings.XAPIAN_LOCATION
        searcher = xappy.SearchConnection(settings.XAPIAN_LOCATION)
        #text = 'FRAMCEs capitalls'
        text = "Capitol STATE"  # deliberately misspelled to exercise spell_correct
        print searcher.spell_correct(text)
        query = searcher.query_field('question',
                                     text,
                                     default_op=searcher.OP_OR)
        results = searcher.search(query, 0, 10)
        print results.matches_estimated
        #print results.estimate_is_exact
        for result in results:
            print result.rank, result.id
            print repr(result.summarise('question')), result.data['state'][0]
            #result.data['state']

        text = 'london'
        query = searcher.query_field('answer', text, default_op=searcher.OP_OR)
        results = searcher.search(query, 0, 10)
        print results.matches_estimated
        #print results.estimate_is_exact
        for result in results:
            print result.rank, result.id
            print repr(result.summarise('question')), result.data['state'][0]
Code example #11
# log start time
start = time.asctime()

# set options verbose, output and whether PostgreSQL should be used to display results
output = True
verbose = True
debug = False
use_psql = True

# get path to this script
root = os.getcwd()

# search connection to Xapian full text index
xapianPath = os.path.join(root, "xapian_PMC_complete")
searchConn = xappy.SearchConnection(xapianPath)
searchConn.reopen()


# get PMC texts from PostgreSQL
def get_text(pmcid):
    stmt = """
    SELECT 
        text
    FROM 
        public.tbl_pmcid_text
    WHERE
        pmcid = %s
    ;
    """
    # "cursor" is assumed to be an open PostgreSQL cursor created elsewhere
    cursor.execute(stmt, (pmcid, ))
    row = cursor.fetchone()
    return row[0] if row else None
Code example #12
File: bench.py  Project: dongshige/wikidpad
    def searcher(self):
        path = os.path.join(self.options.dir,
                            "%s_xappy" % self.options.indexname)
        return xappy.SearchConnection(path)
Code example #13
def query(s):
    sconn = xappy.SearchConnection(DBPATH)
    q = sconn.query_parse(sconn.spell_correct(s), default_op=sconn.OP_AND)
    return [x.data['id'][0] for x in sconn.search(q, 0, 10)]
Code example #14
    def create_index(self):
        self.iconn = xappy.IndexerConnection(self.dbpath)
        self.sconn = xappy.SearchConnection(self.dbpath)

        # keys are filtered package names or "_last_run_"
        self.iconn.add_field_action('key', xappy.FieldActions.INDEX_EXACT)
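
Once the field action is registered, documents can be fed through the indexer the same way code example #10 does; the key value below is invented:

        # Hypothetical follow-up: index one record under the 'key' field.
        doc = xappy.UnprocessedDocument()
        doc.fields.append(xappy.Field('key', 'some-package-name'))
        doc.id = 'some-package-name'
        self.iconn.replace(self.iconn.process(doc))
        self.iconn.flush()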
Code example #15
File: search.py  Project: azizur77/electionspot
def search(s, page=0):
    conn = xappy.SearchConnection(config.search_db)
    q = conn.query_parse(conn.spell_correct(s))
    result = conn.search(q, page*20, page*20+20)
    return result.matches_estimated, [x.data for x in result]
Code example #16
File: views.py  Project: ariel/Spacelog
    def get_context_data(self):
        # Get the query text
        q = self.request.GET.get('q', '')
        # Get the offset value
        try:
            offset = int(self.request.GET.get('offset', '0'))
            if offset < 0:
                offset = 0
        except ValueError:
            offset = 0

        # Is it a special search?
        special_value = self.request.redis_conn.get("special_search:%s:%s" % (
            self.request.mission.name,
            q,
        ))
        if special_value:
            self.template_name = "search/special.html"
            return {
                "q": q,
                "text": special_value,
            }

        # Get the results from Xapian
        db = xappy.SearchConnection(
            os.path.join(
                settings.SITE_ROOT,
                '..',
                "xappydb",
            ), )
        query = db.query_parse(
            q,
            default_op=db.OP_OR,
            deny=["mission"],
        )
        query = db.query_filter(
            query,
            db.query_composite(db.OP_AND, [
                db.query_field("mission", self.request.mission.name),
                db.query_field("transcript",
                               self.request.mission.main_transcript),
            ]))
        results = db.search(
            query=query,
            startrank=offset,
            endrank=offset + PAGESIZE,
            checkatleast=-1,  # check everything (the entire xapian db fits in memory, so this should be fine)
            sortby="-weight",
        )
        # Go through the results, building a list of LogLine objects
        redis_conn = self.request.redis_conn
        log_lines = []
        for result in results:
            transcript_name, timestamp = result.id.split(":", 1)
            log_line = LogLine(redis_conn, transcript_name, int(timestamp))
            log_line.speaker = Character(redis_conn,
                                         transcript_name.split('/')[0],
                                         result.data['speaker_identifier'][0])
            log_line.title = mark_safe(
                result.summarise("text",
                                 maxlen=50,
                                 ellipsis='&hellip;',
                                 strict_length=True,
                                 hl=None))
            log_line.summary = mark_safe(
                result.summarise("text",
                                 maxlen=600,
                                 ellipsis='&hellip;',
                                 hl=('<mark>', '</mark>')))
            log_lines.append(log_line)

        def page_url(offset):
            return reverse("search") + '?' + urllib.urlencode(
                {
                    'q': q.encode('utf-8'),
                    'offset': offset,
                })

        if offset == 0:
            previous_page = False
        else:
            previous_page = page_url(offset - PAGESIZE)

        if offset + PAGESIZE > results.matches_estimated:
            next_page = False
        else:
            next_page = page_url(offset + PAGESIZE)

        thispage = offset / PAGESIZE
        maxpage = results.matches_estimated / PAGESIZE

        pages_to_show = (set([0]) | set([thispage - 1, thispage, thispage + 1])
                         | set([maxpage]))
        if 0 == thispage:
            pages_to_show.remove(thispage - 1)
        if maxpage == thispage:
            pages_to_show.remove(thispage + 1)
        pages = []

        class Page(object):
            def __init__(self, number, url, selected=False):
                self.number = number
                self.url = url
                self.selected = selected

        pages_in_order = list(pages_to_show)
        pages_in_order.sort()
        for page in pages_in_order:
            if len(pages) > 0 and page != pages[-1].number:
                pages.append('...')
            pages.append(
                Page(page + 1, page_url(page * PAGESIZE), page == thispage))

        error_info = self.request.redis_conn.hgetall(
            "error_page:%s:%s" % (
                self.request.mission.name,
                'no_search_results',
            ), )
        if not error_info:
            error_info = {}
        if 'classic_moment_quote' in error_info:
            error_quote = LogLine(
                self.request.redis_conn, self.request.mission.main_transcript,
                timestamp_to_seconds(error_info['classic_moment_quote']))
        else:
            error_quote = None

        return {
            'log_lines': log_lines,
            'result': results,
            'q': q,
            'previous_page': previous_page,
            'next_page': next_page,
            'pages': pages,
            'debug': {
                'query': query,
            },
            'error': {
                'info': error_info,
                'quote': error_quote,
            }
        }
Code example #17
File: issue72.py  Project: anhnguyendepocen/flaxcode
    def testFileTypeSearch(self):
        conn = xappy.SearchConnection(self.col.dbpath())
        res = conn.search(conn.query_field('filetype', 'htm'), 0, 10)
        results = [r for r in res]
        self.assertEqual(len(results), 1)