def testParseMeta(self):
     ''' Header lines should land in the meta dict with lower-cased,
         whitespace-trimmed keys and trimmed values. '''
     stream = StringIO.StringIO(' header1 : value1 \nHEADER2:value2\n\n')
     meta, content = distillparse.parseDistillML(stream)
     self.assertEqual(2, len(meta))
     # surrounding spaces are trimmed from both the key and the value
     self.assertEqual('value1', meta['header1'])
     # header names are normalized to lower case
     self.assertEqual('value2', meta['header2'])
    def indexDoc(self, path):
        ''' Parse the DistillML document at path, archive it and add it
            to the index.

            @param path - filesystem path of the document to index
            @return True if the document was indexed; False if it was
                discarded because an archived version already exists.

            Note: a document whose header lacks a 'uri' field raises
            KeyError and is thereby discarded.
        '''
        fp = open(path, 'rb')       # open() rather than the deprecated file() builtin
        try:
            meta, content = distillparse.parseDistillML(fp, distillparse.writeHeader)
            uri = meta['uri']                               # if there is no uri, throw an exception and discard this doc

            # check index to see if document already indexed
            result = self._searchForArchived(uri, meta)
            if result:
                log.info('discard %s archived(%s) - %s' % (os.path.split(path)[1], result, uri))
                return False

            # add this document in the archive; rewind first because
            # parseDistillML has consumed the stream
            fp.seek(0)
            doc_id = docarchive.idCounter.getNewId()        # renamed from 'id', which shadows the builtin
            self.arcHandler.add_document(doc_id, fp)

            # add this document into the index
            self.writer.addDocument(doc_id, meta, content)

            # remember it in freshly added document
            # note if there are existing uri, it will be overwritten by the new one
            self.freshdocs[uri] = meta

            log.info('%s -> %s' % (os.path.split(path)[1], doc_id))

        finally:
            fp.close()

        return True
def reindex(dbdoc, beginId, endId, index_path):

    ah = docarchive.ArchiveHandler('r')

    writer = lucene_logic.Writer(index_path)
    writer.writer.minMergeDocs = 1000

    for i in xrange(beginId, endId):

        docid = '%09d' % i
        if i % NOTIFY_INTERVAL == 1:
            print '%s Reindexing %09d' % (datetime.datetime.now(), i)

        zfile, filename = ah._open(docid)
        try:
            data = zfile.read(filename)
        except KeyError:
            continue        # skip holes

        fp = StringIO.StringIO(data)
        meta, content = distillparse.parseDistillML(fp, distillparse.writeHeader)
        writer.addDocument(docid, meta, content)

    print '%s optimizing' % datetime.datetime.now()
    writer.optimize()
    writer.close()

    ah.close()
    def testParseTagSpanBuffer(self):
        ''' A tag that straddles the read-buffer boundary must still be
            parsed correctly. '''
        header = '\n'                                       # empty header
        # with bufsize=10 the <item> tag spans the buffer boundary:
        # |123456789|123456789|
        body = 'abcdef<item>ghijk</li>lmn'
        stream = StringIO.StringIO(header + body)
        meta, content = distillparse.parseDistillML(stream, bufsize=10)

        # <item> is kept; the unmatched </li> is dropped
        self.assertEqual('abcdef<item>ghijklmn', content)
    def _get_snapshot_content(self, item):
        ''' Return the distilled text content of the web-archive snapshot
            saved for item, or '' when no snapshot file exists or its root
            resource cannot be fetched.
        '''
        # TODO: refactor
        # item.id of -1 maps to the special snapshot filename '_.mhtml'
        filename = item.id == -1 and '_.mhtml' or '%s.mhtml' % item.id
        spath = cfg.getpath('weblibsnapshot')/filename
        if not spath.exists():
            return ''

        fp = spath.open('rb')       # TODO: check file exist, move to weblib? getSnapshotFile()?
        # NOTE(review): fp is never closed here; LoadedWebArchive may read
        # from it lazily -- confirm before adding a close()/with.
        lwa = mhtml.LoadedWebArchive(fp)
        resp = lwa.fetch_uri(lwa.root_uri)
        if not resp:
            return ''

        # TODO: lucene_logic: use to docid is confusing with lucene's internal docid?
        # TODO: mind content-type, encoding, framed objects??
        # NOTE(review): 'data' is never used, yet resp.read() consumes the
        # stream before distill(resp, ...) below -- presumably distill()
        # rewinds or re-fetches; verify before removing this line.
        data = resp.read()
        meta = {}
        contentBuf = StringIO.StringIO()
        # 'result' is unused; distill() writes the distilled markup into
        # contentBuf and fills meta in place
        result = distillML.distill(resp, contentBuf, meta=meta)
        contentBuf.seek(0)
        # TODO: what's the deal with writeHeader?
        meta, content = distillparse.parseDistillML(contentBuf, writeHeader=None)
        return content
        self.docid      = self.doc.get('docid'      )
        self.date       = self.doc.get('date'       )
        self.uri        = self.doc.get('uri'        )

        # title & description are filled at hightlight()


    def highlight(self, analyzer, highlighter):
        ''' Fill in self.title and self.description from the archived
            document identified by self.docid.

            @param analyzer - Lucene analyzer used to tokenize the content
            @param highlighter - Lucene highlighter producing the fragments

            If the document cannot be retrieved (e.g. a stale index entry
            referring to a missing file) both attributes are set to '' so
            callers never hit an AttributeError.
        '''
        maxNumFragmentsRequired = 2
        try:
            fp = docarchive.get_document(self.docid)
        except Exception:
            # maybe the index is outdated and refers to a non-existent file
            log.exception('Unable to get "%s"' % self.docid)
            # fall back to empty fields instead of leaving them undefined
            self.title = ''
            self.description = ''
        else:
            meta, content = distillparse.parseDistillML(fp)
            tokenStream = analyzer.tokenStream('content', StringIO.StringIO(content))
            self.description = highlighter.getBestFragments(tokenStream, content, maxNumFragmentsRequired, "...")
            self.title = meta.get('title','')



def parseQuery(phrase):
    ''' Build a Lucene query that searches the 'content' field for phrase,
        tokenized with the StandardAnalyzer. '''
    return QueryParser.parse(phrase, "content", StandardAnalyzer())


# cap on the number of search hits considered (used by the search code below)
MAXRESULT = 1000

def sortHits(hits, maxDoc):
    """ Return list of (adj score, id, doc, original score) """
    def testParseTags(self):
        ''' Structural tags survive parsing; formatting tags are stripped. '''
        header = '\n'                                       # empty header
        stream = StringIO.StringIO(header + '<item><h1>*</h1></item>')
        meta, content = distillparse.parseDistillML(stream)

        # <h1> is stripped while <item> is preserved
        self.assertEqual('<item>*</item>', content)
 def testParse0(self):
     ''' A minimal file -- just an empty header line -- parses to an
         empty meta dict and empty content. '''
     stream = StringIO.StringIO('\n')
     meta, content = distillparse.parseDistillML(stream)
     self.assertEqual(0, len(meta))      # no header fields
     self.assertEqual('', content)       # no body either
 def testParse00(self):
     ''' A completely empty file (invalid: the header section is missing)
         still parses to empty meta and empty content. '''
     stream = StringIO.StringIO('')
     meta, content = distillparse.parseDistillML(stream)
     self.assertEqual(0, len(meta))      # nothing parsed
     self.assertEqual('', content)       # no content recovered