def __buildDoc(self, article):
    if article.getTitle() is None:
        return None
    doc = xappy.UnprocessedDocument()
    doc.fields.append(xappy.Field("title", article.getTitle()))
    if article.getAbstract() is not None:
        doc.fields.append(xappy.Field("text", article.getAbstract()))
    # 'INDEX_EXACT' terms have a maximum length of 220, but the prefix "XA"
    # is added to each term in the document, so the effective maximum is 218.
    for chemical in [chemical for chemical in article.getChemicals()
                     if len(chemical) < 219]:
        doc.fields.append(xappy.Field("chemical_exact", chemical))
    for keyword in article.getKeywords():
        doc.fields.append(xappy.Field("keyword", keyword))
    for mesh in article.getMeSH():
        doc.fields.append(xappy.Field("mesh", mesh))
    doc.id = str(article.getPMID())
    return doc
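# A minimal sketch (not from the original source) of the indexer setup this
# builder appears to assume: 'chemical_exact' registered with INDEX_EXACT
# (hence the 218-character term limit noted above) and the remaining fields
# as free text. The database path is illustrative.
import xappy

conn = xappy.IndexerConnection('medline.db')  # hypothetical path
conn.add_field_action('title', xappy.FieldActions.INDEX_FREETEXT, language='en')
conn.add_field_action('text', xappy.FieldActions.INDEX_FREETEXT, language='en')
conn.add_field_action('chemical_exact', xappy.FieldActions.INDEX_EXACT)
conn.add_field_action('keyword', xappy.FieldActions.INDEX_FREETEXT)
conn.add_field_action('mesh', xappy.FieldActions.INDEX_FREETEXT)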
def document(self, connection, retry=False):
    """Return a xapian index document from the context.

    We can introspect the connection to discover relevant fields available.
    """
    doc = xappy.UnprocessedDocument()
    if interfaces.ENABLE_LOGGING:
        log.debug("Indexing Document %r" % self.context)
    # object type
    doc.fields.append(
        xappy.Field("object_type", self.context.__class__.__name__))
    # object kind
    doc.fields.append(
        xappy.Field("object_kind", domain.object_hierarchy_type(self.context)))
    try:
        # TODO: loop through all available languages and index the translations
        self.index(doc)
    except (exceptions.OperationalError, exceptions.InvalidRequestError):
        # detach the dbapi connection from the pool, close it,
        # and retry the index operation (once)
        log.error("Indexing Connection Hosed, Discarding")
        db_connection = metadata.bind.contextual_connect()
        db_connection.begin().rollback()
        db_connection.detach()
        db_connection.close()
        if not retry:
            return self.document(connection, retry=True)
        raise
    return doc
def build_index(self, remove_old=True):
    if remove_old:
        remove_directory(self.search_db_dir)
    self.__xappy = xappy.IndexerConnection(self.search_db_dir)
    self.__xappy.add_field_action("module_uid",
                                  xappy.FieldActions.STORE_CONTENT)
    self.__xappy.add_field_action("keyword_term",
                                  xappy.FieldActions.INDEX_FREETEXT,
                                  nopos=True)
    for module_keyword in self.__keywords:
        for keyword in module_keyword[2]:
            module_doc = xappy.UnprocessedDocument()
            module_doc.fields.append(xappy.Field("module_uid", keyword[0]))
            terms = list(split_word(keyword[1], True))
            module_doc.fields.append(
                xappy.Field("keyword_term", ' '.join(terms)))
            self.__xappy.add(module_doc)
    self.__xappy.close()
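# Search-side sketch (illustrative, not from the original source): because
# 'module_uid' is stored with STORE_CONTENT, each hit on the free-text
# 'keyword_term' field carries the module uid back in its data dict.
import xappy

sconn = xappy.SearchConnection(search_db_dir)  # same directory as above
results = sconn.search(sconn.query_field('keyword_term', 'histogram'), 0, 10)
for result in results:
    print result.data['module_uid'][0]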
def document(self, connection, retry=False):
    """Return a xapian index document from the context.

    We can introspect the connection to discover relevant fields available.
    """
    doc = xappy.UnprocessedDocument()
    if interfaces.ENABLE_LOGGING:
        log.debug("Indexing Document %r" % self.context)
    # object type
    doc.fields.append(
        xappy.Field("object_type", self.context.__class__.__name__))
    # object kind
    doc.fields.append(
        xappy.Field("object_kind", domain.object_hierarchy_type(self.context)))
    # object language
    doc.fields.append(xappy.Field("language", self.context.language))
    doc.fields.append(
        xappy.Field("status", getattr(self.context, "status", "")))
    doc.fields.append(
        xappy.Field("owner", str(getattr(self.context, "owner_id", ""))))
    try:
        status_date = getattr(self.context, "status_date")
        if status_date:
            status_date = date_value(status_date)
            doc.fields.append(xappy.Field("status_date", status_date))
    except Exception:
        pass
    title = ""
    try:
        title = bungeni.ui.search.ISearchResult(self.context).title
    except Exception:
        pass
    doc.fields.append(xappy.Field("title", title))
    try:
        # TODO: loop through all available languages and index the translations
        self.index(doc)
    except (exceptions.OperationalError, exceptions.InvalidRequestError):
        # detach the dbapi connection from the pool, close it,
        # and retry the index operation (once)
        log.error("Indexing Connection Hosed, Discarding")
        db_connection = metadata.bind.contextual_connect()
        db_connection.begin().rollback()
        db_connection.detach()
        db_connection.close()
        if not retry:
            return self.document(connection, retry=True)
        raise
    return doc
def add_to_index(data):
    doc = xappy.UnprocessedDocument()
    doc.id = data.id
    for k, v in data.items():
        doc.fields.append(xappy.Field(k, v))
    doc = indexer.process(doc)
    doc.data = data
    indexer.replace(doc)
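# A minimal sketch (assumed, not part of the original source) of the
# module-level `indexer` this helper relies on; the path and field names are
# illustrative, and the real field actions depend on what `data` carries.
import xappy

indexer = xappy.IndexerConnection('search.db')  # hypothetical path
indexer.add_field_action('title', xappy.FieldActions.INDEX_FREETEXT,
                         language='en')
indexer.add_field_action('title', xappy.FieldActions.STORE_CONTENT)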
def index_document(self, conn, d):
    if hasattr(self.bench, "process_document_xappy"):
        self.bench.process_document_xappy(d)
    doc = xappy.UnprocessedDocument()
    for key, values in d:
        if not isinstance(values, list):
            values = [values]
        for value in values:
            doc.fields.append(xappy.Field(key, value))
    conn.add(doc)
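# Usage sketch (illustrative, not from the original source): `d` must iterate
# as (key, value-or-list-of-values) pairs, so multi-valued fields fan out into
# one xappy.Field per value.
d = [('title', 'A Tale of Two Cities'),
     ('tag', ['fiction', 'classic'])]
# self.index_document(conn, d)  # indexes one 'title' field and two 'tag' fields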
def perform(self):
    updoc = xappy.UnprocessedDocument()
    for k, v in self.doc.iteritems():
        if isinstance(v, list):
            for v2 in v:
                updoc.append(k, v2)
        else:
            updoc.append(k, v)
    if self.docid is not None:
        updoc.id = self.docid
        self.db_writer.iconn.replace(updoc)
    else:
        self.db_writer.iconn.add(updoc)
def _process_file(self, file_name, conn, collection_name, filter_settings):
    """Extract text from a file, make a xapian document and add it to the
    database. Return True if completed successfully, False otherwise.
    """
    get_remote_log().debug("Indexing collection %s: processing file: %s" %
                           (collection_name, file_name))
    unused, ext = os.path.splitext(file_name)
    ext = ext.lower()
    if self.stale(file_name, conn):
        filter = self._find_filter(filter_settings[ext[1:]])
        if filter:
            get_remote_log().debug("Filtering file %s using filter %s" %
                                   (file_name, filter))
            fixed_fields = (
                ("filename", file_name),
                ("nametext", os.path.basename(file_name)),
                ("filepathtext", file_name),
                ("filetype", os.path.splitext(file_name)[1][1:]),
                ("collection", collection_name),
                ("mtime", str(os.path.getmtime(file_name))),
                ("size", str(os.path.getsize(file_name))),
            )
            for field, value in fixed_fields:
                assert field in dbspec.internal_fields()
            try:
                filtered_blocks = itertools.ifilter(self._accept_block,
                                                    filter(file_name))
                fields = itertools.starmap(
                    xappy.Field,
                    itertools.chain(fixed_fields, filtered_blocks))
                doc = xappy.UnprocessedDocument(fields=fields)
                doc.id = file_name
                # FIXME - if this raises an error, it's probably more serious
                # (e.g. database corruption) than a filter raising an error.
                conn.replace(doc)
                get_remote_log().debug(
                    "Added (or replaced) doc %s to collection %s with text "
                    "from source file %s" %
                    (doc.id, collection_name, file_name))
                return True
            except Exception, e:
                get_remote_log().error(
                    "Filtering file: %s with filter: %s exception %s(%s), "
                    "skipping" %
                    (file_name, filter, type(e).__name__, str(e)))
                return False
        else:
            get_remote_log().warn(
                "Filter for %s is not valid, not filtering file: %s" %
                (ext, file_name))
            return False
def load_pols(iconn):
    pols = schema.Politician.select()
    for p in pols:
        doc = xappy.UnprocessedDocument()
        for format in nameformats.split('\n'):
            text = replacetable(format,
                                dict(first=p.firstname,
                                     middle=p.middlename,
                                     last=p.lastname,
                                     nickname=p.nickname))
            index(doc, 'name', text)
        index(doc, 'name', p.id.replace('_', ' '))
        index(doc, 'id', p.id)
        iconn.add(doc)
def update_timestamp(self, timestamp):
    doc = self.get_timestamp_doc()
    if doc:
        doc._doc.set_data(str(timestamp))
        self.iconn.replace(doc)
        self.iconn.flush()
    else:
        doc = xappy.UnprocessedDocument()
        doc.fields.append(xappy.Field('key', '_last_run_'))
        processed_doc = self.iconn.process(doc, False)
        processed_doc._doc.set_data(str(timestamp))
        # preempt xappy's processing of data
        processed_doc._data = None
        self.iconn.add(processed_doc)
        self.iconn.flush()
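# A possible shape (assumed, not from the original source) for the matching
# get_timestamp_doc() helper: look the sentinel document up by its 'key'
# field. This presumes 'key' is registered with INDEX_EXACT and that a search
# result (a processed document) can be handed back to iconn.replace().
def get_timestamp_doc(self):
    sconn = xappy.SearchConnection(self.dbpath)  # hypothetical path attribute
    results = sconn.search(sconn.query_field('key', '_last_run_'), 0, 1)
    for result in results:
        return result
    return None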
def MakeIndex():
    connection = xappy.IndexerConnection('kis/lib/data')
    cursor = connections['default'].cursor()
    cursor.execute("SELECT rec_id,name FROM t_show_store_eisup_list;")
    data = cursor.fetchall()
    for item in data:
        doc = xappy.UnprocessedDocument()
        doc.fields.append(xappy.Field('kod', item[0].encode('utf-8')))
        doc.fields.append(xappy.Field('name', item[1].encode('utf-8')))
        connection.add(doc)
    connection.flush()
    connection.close()
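# Sketch (assumed, not in the original): xappy only acts on fields that have
# registered field actions, so for 'kod' and 'name' to be searchable and
# retrievable, calls like these would precede the loop in MakeIndex().
connection.add_field_action('kod', xappy.FieldActions.INDEX_EXACT)
connection.add_field_action('kod', xappy.FieldActions.STORE_CONTENT)
connection.add_field_action('name', xappy.FieldActions.INDEX_FREETEXT)
connection.add_field_action('name', xappy.FieldActions.STORE_CONTENT)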
def index(items, doc_type, create=False):
    indexer = IndexerContext(settings.XAPIAN_DB)
    if create:
        with indexer as conn:
            create_index(conn)
    preprocess_text = lambda t: normalize_text(t).lower()
    with indexer as conn:
        n = 0
        for n, (key, data) in enumerate(items, 1):
            doc = xappy.UnprocessedDocument(key)
            doc.append('type', doc_type)
            for field in TEXT_FIELDS:
                val = data.get(field, '')
                if val:
                    doc.append(field, preprocess_text(val))
            for field in EXACT_FIELDS:
                val = data.get(field, '')
                if field == 'date' and val:
                    val = val.partition(' ')[0]
                    if not val.count('-') == 2:
                        val = None
                if val:
                    doc.append(field, val)
            for field, kwargs in SORTABLE_FIELDS:
                val = data.get(field)
                if not val:
                    continue
                doc.append(field, val, **kwargs)
            for field in FACET_FIELDS:
                val = data.get(field)
                if not val:
                    continue
                doc.append(field, val)
            for field in COLLAPSE_FIELDS:
                val = data.get(field)
                if not val:
                    continue
                doc.append(field, val)
            conn.add(doc)
    return n
def add_to_search_index(self, mission, id, chunk, weight, timestamp):
    """Take some text and a set of speakers (also text) and add a document
    to the search index, with the id stuffed in the document data.
    """
    lines = chunk['lines']
    doc = xappy.UnprocessedDocument()
    doc.fields.append(xappy.Field("mission", mission))
    doc.fields.append(xappy.Field("weight", weight))
    doc.fields.append(xappy.Field("transcript", self.transcript_name))
    for line in lines:
        text = re.sub(
            r"\[\w+:([^]]+)\|([^]]+)\]",
            lambda m: m.group(2),
            line['text'],
        )
        text = re.sub(
            r"\[\w+:([^]]+)\]",
            lambda m: m.group(1),
            text,
        )
        # also strip tags from text, because they're lame lame lame
        text = strip_tags(text)
        doc.fields.append(xappy.Field("text", text))
        # grab the character to get some more text to index under speaker
        ch = self.characters.get(line['speaker'], None)
        if ch:
            ch2 = ch.current_shift(timestamp)
            doc.fields.append(
                xappy.Field("speaker_identifier", ch2.identifier))
            doc.fields.append(xappy.Field("speaker", ch2.short_name))
            doc.fields.append(xappy.Field("speaker", ch.short_name))
        else:
            doc.fields.append(
                xappy.Field("speaker_identifier", line['speaker']))
            doc.fields.append(xappy.Field("speaker", line['speaker']))
    doc.id = id
    try:
        search_db.replace(search_db.process(doc))
    except xappy.errors.IndexerError:
        print "umm, error"
        print id, lines
        raise
def _get_document(self, connection, doc_id, mtime, mode):
    do_index = False
    if mode == 'update':
        try:
            doc = connection.get_document(doc_id)
            docmtime = long(doc.data['mtime'][0])
        except KeyError:
            do_index = True
        else:
            do_index = mtime > docmtime
    elif mode == 'add':
        do_index = True
    else:
        raise ValueError("mode must be 'update' or 'add'")
    if do_index:
        document = xappy.UnprocessedDocument()
        document.id = doc_id
    else:
        document = None
    return document
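# Illustrative caller (assumed, not from the original source): in 'update'
# mode a document comes back only when the stored mtime is older than the
# file's, so 'mtime' must be stored (STORE_CONTENT) on every document.
mtime = os.path.getmtime(path)  # 'path' is hypothetical
doc = self._get_document(connection, doc_id, mtime, 'update')
if doc is not None:
    doc.fields.append(xappy.Field('mtime', str(mtime)))
    connection.replace(doc)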
def _factory(db, doc):
    ixdoc = xappy.UnprocessedDocument()
    ixdoc.id = doc['_id']
    for D in i:
        for data in D['data']:
            data, num_items = expand(data, doc)
            for n in xrange(num_items):
                if 'factory' in D:
                    out = import_func(D['factory'])(doc)
                    if isinstance(out, ListType):
                        for index_text in out:
                            print 'INDEX_TEXT', index_text
                            ixdoc.fields.append(
                                xappy.Field(D['name'], index_text))
                    else:
                        index_text = out
                        print 'INDEX_TEXT', index_text
                        ixdoc.fields.append(
                            xappy.Field(D['name'], index_text))
                else:
                    index_text = (data % {'n': n}) % api.dotted(doc)
                    print 'INDEX_TEXT', index_text
                    ixdoc.fields.append(xappy.Field(D['name'], index_text))
    return ixdoc
def index_pkgs(self):
    yum_pkgs = self.index_yum_pkgs()
    pkg_count = 0

    for pkg in yum_pkgs.values():
        pkg_count += 1
        doc = xappy.UnprocessedDocument()
        filtered_name = filter_search_string(pkg['name'])
        filtered_summary = filter_search_string(pkg['summary'])
        filtered_description = filter_search_string(pkg['description'])

        if pkg['name'] != filtered_name:
            print("%d: indexing %s as %s" %
                  (pkg_count, pkg['name'], filtered_name))
        else:
            print("%d: indexing %s" % (pkg_count, pkg['name']))

        doc.fields.append(
            xappy.Field('exact_name', 'EX__' + filtered_name + '__EX',
                        weight=10.0))

        name_parts = filtered_name.split('_')
        for i in range(20):
            if len(name_parts) > 1:
                for part in name_parts:
                    doc.fields.append(xappy.Field('name', part, weight=1.0))
            doc.fields.append(xappy.Field('name', filtered_name, weight=10.0))

        for i in range(4):
            doc.fields.append(
                xappy.Field('summary', filtered_summary, weight=1.0))

        doc.fields.append(
            xappy.Field('description', filtered_description, weight=0.2))

        self.index_files(doc, pkg)
        self.index_tags(doc, pkg)

        for sub_pkg in pkg['sub_pkgs']:
            pkg_count += 1
            filtered_sub_pkg_name = filter_search_string(sub_pkg['name'])
            if filtered_sub_pkg_name != sub_pkg['name']:
                print("%d: indexing subpkg %s as %s" %
                      (pkg_count, sub_pkg['name'], filtered_sub_pkg_name))
            else:
                print("%d: indexing subpkg %s" %
                      (pkg_count, sub_pkg['name']))

            doc.fields.append(
                xappy.Field('subpackages', filtered_sub_pkg_name, weight=1.0))
            doc.fields.append(
                xappy.Field('exact_name',
                            'EX__' + filtered_sub_pkg_name + '__EX',
                            weight=10.0))

            self.index_files(doc, sub_pkg)
            self.index_tags(doc, sub_pkg)
            if sub_pkg['icon'] != self.default_icon \
                    and pkg['icon'] == self.default_icon:
                pkg['icon'] = sub_pkg['icon']

            # remove anything we don't want to store
            del sub_pkg['pkg']

        # @@: Right now we're only indexing the first part of the
        # provides/requires, and not boolean comparison or version
        #for requires in pkg.requires:
        #    print requires[0]
        #    doc.fields.append(xappy.Field('requires', requires[0]))
        #for provides in pkg.provides:
        #    doc.fields.append(xappy.Field('provides', provides[0]))

        # remove anything we don't want to store and then store data in
        # json format
        del pkg['pkg']
        del pkg['src_pkg']

        processed_doc = self.iconn.process(doc, False)
        processed_doc._doc.set_data(json.dumps(pkg))
        # preempt xappy's processing of data
        processed_doc._data = None
        self.iconn.add(processed_doc)

    self.icon_cache.close()
    return pkg_count
def __init__(self, resource):
    self.resource = resource
    self.doc = xappy.UnprocessedDocument()
    log.debug("indexing %s - %s", self.type, self.resource)
def _create_document(self, package):
    doc = xappy.UnprocessedDocument()
    filtered_name = filter_search_string(package['name'])
    filtered_summary = filter_search_string(package['summary'])
    filtered_description = filter_search_string(package['description'])

    doc.fields.append(
        xappy.Field('exact_name', 'EX__' + filtered_name + '__EX',
                    weight=10.0))

    name_parts = filtered_name.split('_')
    for i in range(20):
        if len(name_parts) > 1:
            for part in name_parts:
                doc.fields.append(xappy.Field('name', part, weight=1.0))
        doc.fields.append(xappy.Field('name', filtered_name, weight=10.0))

    for i in range(4):
        doc.fields.append(
            xappy.Field('summary', filtered_summary, weight=1.0))

    doc.fields.append(
        xappy.Field('description', filtered_description, weight=0.2))

    self.index_files_of_interest(doc, package)
    self.index_tags(doc, package)

    for sub_package in package['sub_pkgs']:
        filtered_sub_package_name = filter_search_string(sub_package['name'])
        log.info("    indexing subpackage %s" % sub_package['name'])

        doc.fields.append(
            xappy.Field('subpackages', filtered_sub_package_name,
                        weight=1.0))
        doc.fields.append(
            xappy.Field('exact_name',
                        'EX__' + filtered_sub_package_name + '__EX',
                        weight=10.0))

        self.index_files_of_interest(doc, sub_package)

        # fedora-tagger does not provide special tags for sub-packages...
        #self.index_tags(doc, sub_package)

        # Set special sub-package icon if appstream has one
        sub_package['icon'] = self.icon_cache.get(sub_package['name'],
                                                  self.default_icon)

        # If the parent has a dull icon, give it ours!
        if sub_package['icon'] != self.default_icon \
                and package['icon'] == self.default_icon:
            package['icon'] = sub_package['icon']

        # remove anything we don't want to store
        del sub_package['package']

    # @@: Right now we're only indexing the first part of the
    # provides/requires, and not boolean comparison or version
    #for requires in package.requires:
    #    print requires[0]
    #    doc.fields.append(xappy.Field('requires', requires[0]))
    #for provides in package.provides:
    #    doc.fields.append(xappy.Field('provides', provides[0]))

    # remove anything we don't want to store and then store data in
    # json format
    del package['package']

    return doc
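# Search-side sketch (illustrative, not from the original source): the
# 'EX__<name>__EX' convention built above turns 'exact_name' into a
# high-precision lookup when the query term is wrapped the same way.
import xappy

sconn = xappy.SearchConnection('xapian/search')  # hypothetical path
term = 'EX__' + filter_search_string('firefox') + '__EX'
results = sconn.search(sconn.query_field('exact_name', term), 0, 10)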
# get all documents in the currently selected journal directory
files = os.listdir(recent_filePath)
# store current indexing ID (used in ids.txt)
recent_xapianPath = str(counter)
# open a new file connection to create a Xapian index
conn = xappy.IndexerConnection(os.path.join(xapianPath, recent_xapianPath))
# create field to store the full texts
conn.add_field_action('text', xappy.FieldActions.INDEX_FREETEXT, language='en')
if not use_psql:
    # create a data field to store the full text in it, e.g. while iterating
    # over search results
    conn.add_field_action('text', xappy.FieldActions.STORE_CONTENT)

# iterate over all documents in the journal directory
for file_name in files:
    doc = xappy.UnprocessedDocument()
    f = open(os.path.join(recent_filePath, file_name), "r")
    text = f.read()
    f.close()
    doc.fields.append(xappy.Field("text", text))
    try:
        file_name = "'" + file_name + "'"
        pmcid = str(get_PMC(file_name))
        doc.id = pmcid
        if use_psql:
            insert_ID_and_text(pmcid, text)
    except:
        # possibly duplicates (from files_without_pmc.txt) - the less command
        # shows the same content, although with different formatting of line
        # breaks, e.g.:
        #   less Biosci_Rep_2012_Dec_1_32\(6\)_549-557.txt
        # contains a PMC ID, but the following file does not:
        #   less Biosci_Rep_2012_Dec_1_32\(Pt_6\)_549-557.txt
        pass
def main():
    tornado.options.parse_command_line()
    from apps.main.models import User
    from apps.questions.models import Question, Genre
    from mongokit import Connection
    con = Connection()
    con.register([Question, Genre, User])
    db = con.gkc

    if options.reindex_all:
        since = datetime.datetime(1979, 12, 13)
    else:
        since = options.since
        if not since:
            since = default_since
        try:
            since = datetime.datetime.strptime(since, '%Y-%m-%d %H-%M-%S')
        except ValueError:
            since = datetime.datetime.strptime(since, '%Y-%m-%d')
    if options.verbose:
        print 'since', since

    genres = {}
    authors = {}
    count = 0
    search = {'modify_date': {'$gt': since}}
    if not db.Question.find(search).count():
        if options.verbose:
            print "0 questions"
        if not options.test:
            return

    youngest = since
    indexer = xappy.IndexerConnection(settings.XAPIAN_LOCATION)
    if not indexer.get_fields_with_actions() or options.update_fields:
        indexer.add_field_action('question',
                                 xappy.FieldActions.INDEX_FREETEXT,
                                 weight=2, language='en', spell=True,
                                 stop=stopwords)
        indexer.add_field_action('answer',
                                 xappy.FieldActions.INDEX_FREETEXT,
                                 language='en', spell=True)
        indexer.add_field_action('accept',
                                 xappy.FieldActions.INDEX_FREETEXT,
                                 language='en', spell=True)
        indexer.add_field_action('alternatives',
                                 xappy.FieldActions.INDEX_FREETEXT,
                                 language='en', spell=True)
        indexer.add_field_action('author', xappy.FieldActions.INDEX_EXACT)
        indexer.add_field_action('genre', xappy.FieldActions.INDEX_EXACT)
        indexer.add_field_action('comment',
                                 xappy.FieldActions.INDEX_FREETEXT,
                                 language='en', spell=False,
                                 search_by_default=False, stop=stopwords)
        indexer.add_field_action('date', xappy.FieldActions.SORTABLE,
                                 type="date")
        indexer.add_field_action('state', xappy.FieldActions.SORTABLE)
        indexer.add_field_action('question', xappy.FieldActions.STORE_CONTENT)
        indexer.add_field_action('answer', xappy.FieldActions.STORE_CONTENT)
        indexer.add_field_action('genre', xappy.FieldActions.STORE_CONTENT)
        indexer.add_field_action('state', xappy.FieldActions.STORE_CONTENT)

    t0 = time.time()
    for question in db.Question.collection.find(search):
        if question['modify_date'] > youngest:
            youngest = question['modify_date']
        doc = xappy.UnprocessedDocument()
        doc.fields.append(xappy.Field('state', question['state']))
        doc.fields.append(xappy.Field('question', question['text']))
        doc.fields.append(xappy.Field('answer', question['answer']))
        if question['genre'].id in genres:
            genre = genres[question['genre'].id]
        else:
            genre = db.Genre.one({'_id': question['genre'].id})
            genre = genre.name
            genres[question['genre'].id] = genre
        doc.fields.append(xappy.Field('genre', genre))
        if question['author'].id in authors:
            author = authors[question['author'].id]
        else:
            author = db.User.one({'_id': question['author'].id})
            author = author.username
            authors[question['author'].id] = author
        doc.fields.append(xappy.Field('author', author))
        doc.fields.append(xappy.Field('comment', question['comment']))
        doc.fields.append(
            xappy.Field('accept', '\n'.join(question['accept'])))
        doc.fields.append(
            xappy.Field('alternatives', '\n'.join(question['alternatives'])))
        doc.id = str(question['_id'])
        pdoc = indexer.process(doc)
        indexer.replace(pdoc)
        count += 1
        #if count and not count % 100:
        #    indexer.flush()

    # add a second to avoid milliseconds causing the same doc to be indexed
    # over and over
    youngest += datetime.timedelta(seconds=1)
    open(since_filename, 'w').write(youngest.strftime('%Y-%m-%d %H-%M-%S\n'))
    indexer.flush()
    t1 = time.time()
    indexer.close()
    if options.verbose:
        print round(t1 - t0, 3), "seconds to index", count, "questions"

    # test
    if options.test:
        print settings.XAPIAN_LOCATION
        searcher = xappy.SearchConnection(settings.XAPIAN_LOCATION)
        text = 'FRAMCEs capitalls'
        text = "Capitol STATE"
        print searcher.spell_correct(text)
        query = searcher.query_field('question', text,
                                     default_op=searcher.OP_OR)
        results = searcher.search(query, 0, 10)
        print results.matches_estimated
        #print results.estimate_is_exact
        for result in results:
            print result.rank, result.id
            print repr(result.summarise('question')), result.data['state'][0]
            #result.data['state']

        text = 'london'
        query = searcher.query_field('answer', text,
                                     default_op=searcher.OP_OR)
        results = searcher.search(query, 0, 10)
        print results.matches_estimated
        #print results.estimate_is_exact
        for result in results:
            print result.rank, result.id
            print repr(result.summarise('question')), result.data['state'][0]
def init_db(self, *args):
    """Loop through all packages and get the latest builds for koji tags
    listed in distmappings.
    """
    self.new_timestamp = time.time() - 60
    print "Calculating timestamp minus 1 minute to account for any skew between the servers (%s)" % \
        time.strftime('%Y-%m-%d %H:%M:%S', time.localtime(self.new_timestamp))

    print "Initializing Index"
    package_list = self.koji_client.listPackages()
    i = 0
    for pkg in package_list:
        i += 1
        pkg_name = pkg['package_name']
        print "%d: Processing package %s" % (i, pkg_name)
        name_len = len(pkg_name)
        doc = xappy.UnprocessedDocument()
        filtered_name = filter_search_string(pkg_name)
        doc.fields.append(xappy.Field('key', filtered_name))
        latest_builds = {'name': pkg_name}
        for t in tags:
            tag = t['tag']
            if t['name'] in latest_builds:
                # short circuit optimization
                continue
            builds = self.koji_client.getLatestBuilds(tag, package=pkg_name)
            if builds:
                build = None
                for b in builds:
                    # only get builds which completed
                    if b['state'] == koji.BUILD_STATES['COMPLETE']:
                        build = b
                        break
                if build:
                    data = {'version': build['version'],
                            'release': build['release'],
                            'build_id': build['build_id']}
                    if build.get('epoch', None) is not None:
                        data['epoch'] = str(build['epoch'])
                        version_display = "%s:%s.%s" % (
                            data['epoch'], data['version'], data['release'])
                    else:
                        version_display = "%s.%s" % (data['version'],
                                                     data['release'])
                    latest_builds[t['name']] = data
                    print "    %s: %s" % (t['name'], version_display)
        if len(latest_builds) < 2:
            # don't process doc if there is no real data;
            # most likely this is an outdated package
            continue
        processed_doc = self.iconn.process(doc, False)
        processed_doc._doc.set_data(json.dumps(latest_builds))
        # preempt xappy's processing of data
        processed_doc._data = None
        self.iconn.add(processed_doc)

    print "Finished updating timestamp"
    self.update_timestamp(self.new_timestamp)
def update_db(self, timestamp=None):
    """Ask koji for any changes after we last ran the mapper. If a timestamp
    is provided in ISO format ('YYYY-MM-DD HH:MI:SS'), use that instead.
    """
    try:
        timestamp = float(timestamp)
    except (ValueError, TypeError):
        pass
    if not timestamp:
        timestamp = self.get_current_timestamp()
        try:
            timestamp = float(timestamp)
        except (ValueError, TypeError):
            pass
    if not timestamp:
        print "Error: you need to specify a time to update from in ISO format ('YYYY-MM-DD HH:MI:SS') or run init"
        exit(-1)

    self.new_timestamp = time.time() - 60
    print "Calculating timestamp minus 1 minute to account for any skew between the servers (%s)" % \
        time.strftime('%Y-%m-%d %H:%M:%S', time.localtime(self.new_timestamp))

    opts = {'completedAfter': timestamp,
            'method': 'tagBuild',
            'decode': True}

    if isinstance(timestamp, float):
        display_timestamp = time.strftime('%Y-%m-%d %H:%M:%S',
                                          time.localtime(timestamp))
    else:
        display_timestamp = timestamp
    print "Getting Task List since %s" % display_timestamp
    task_list = self.koji_client.listTasks(opts=opts)

    print "Updating Index"
    for task in task_list:
        parent_id = task['parent']
        if parent_id:
            builds = self.koji_client.listBuilds(taskID=parent_id)
            if len(builds) < 1:
                continue
            build = builds[0]
            pkg_tags = self.koji_client.listTags(build['build_id'])
            dist_name = None
            for t in pkg_tags:
                dist_name = tags_to_name_map.get(t['name'], None)
                if dist_name:
                    break
            if not dist_name:
                continue

            pkg_doc = None
            if build['name'] in self.found_packages:
                pkg_doc = self.found_packages[build['name']]
            else:
                results = self.search(build['name'])
                if results:
                    pkg_doc = results[0]

            build_epoch = build.get('epoch', None)
            if build_epoch is not None:
                build_epoch = str(build_epoch)

            if not pkg_doc:
                # TODO create new document
                print "ran into new package %s" % build['name']
                self.new_packages[build['name']] = True
                doc = xappy.UnprocessedDocument()
                filtered_name = filter_search_string(build['name'])
                doc.fields.append(xappy.Field('key', filtered_name))
                latest_builds = {'name': build['name']}
                data = {}
                if build_epoch is not None:
                    data['epoch'] = build_epoch
                data['version'] = build['version']
                data['release'] = build['release']
                data['build_id'] = build['build_id']
                latest_builds[dist_name] = data
                processed_doc = self.iconn.process(doc, False)
                processed_doc._doc.set_data(json.dumps(latest_builds))
                # preempt xappy's processing of data
                processed_doc._data = None
                self.iconn.add(processed_doc)
                self.sconn_needs_reload = True
                self.iconn.flush()
            else:
                latest_builds = json.loads(pkg_doc._doc.get_data())
                data = latest_builds.get(dist_name, {'version': '0',
                                                     'release': '0',
                                                     'build_id': 0})
                data_epoch = None
                do_update = False
                if 'release' not in data:
                    # do the update because we have old data
                    do_update = True
                else:
                    data_epoch = data.get('epoch', None)
                    if data_epoch is not None:
                        data_epoch = str(data_epoch)
                    if rpm.labelCompare(
                            (build_epoch, build['version'], build['release']),
                            (data_epoch, data['version'],
                             data['release'])) == 1:
                        do_update = True

                if do_update:
                    self.updated_packages[build['name']] = True
                    build_vr = ''
                    if build_epoch is not None:
                        build_vr = "%s:%s.%s" % (build_epoch,
                                                 build['version'],
                                                 build['release'])
                    else:
                        build_vr = "%s.%s" % (build['version'],
                                              build['release'])
                    data_vr = ''
                    if data_epoch is not None:
                        data_vr = "%s:%s.%s" % (data_epoch, data['version'],
                                                data.get('release', ''))
                    else:
                        data_vr = "%s.%s" % (data['version'],
                                             data.get('release', ''))
                    print "Updating package %s in dist %s to version %s (from %s)" % (
                        build['name'], dist_name, build_vr, data_vr)

                    if build_epoch is not None:
                        data['epoch'] = build_epoch
                    data['version'] = build['version']
                    data['release'] = build['release']
                    data['build_id'] = build['build_id']
                    latest_builds[dist_name] = data
                    pkg_doc._doc.set_data(json.dumps(latest_builds))
                    # preempt xappy's processing of data
                    pkg_doc._data = None
                    self.iconn.replace(pkg_doc)
                    self.sconn_needs_reload = True
                    self.found_packages[build['name']] = pkg_doc
                    self.iconn.flush()

    updated_count = len(self.updated_packages)
    new_count = len(self.new_packages)
    print "Updated: %d packages" % updated_count
    print "  Added: %d packages" % new_count
    print "========================="
    print "  Total: %s" % (updated_count + new_count)
    self.update_timestamp(self.new_timestamp)