コード例 #1
0
    def __init__( self, msg,filename ):
        """Build an index document from a parsed e-mail message.

        Adds the fields 'from', 'subject' and 'body' (Field.Text), 'id'
        (Field.Keyword) and 'all' (sender + subject + body concatenated).

        msg      -- message object; this method uses its get(), walk(),
                    is_multipart() and get_payload() methods.
        filename -- only used to identify the message in the diagnostics
                    written to sys.stderr.
        """
        Document.__init__( self )

        sender = msg.get('from', '').decode('utf8', 'replace')
        self.add( Field.Text( 'from', sender ) )

        subject = msg.get('subject', '').decode('utf8', 'replace')
        self.add( Field.Text( 'subject', subject ) )

        # Named message_id rather than id so the builtin is not shadowed.
        message_id = msg.get('Message-ID', '').decode('utf8', 'replace')
        self.add( Field.Keyword( 'id', message_id ) )

        # Date indexing is currently disabled:
        #date = strftime( '%Y%m%d%H%M%S', strptime(msg.get('Date', '').decode('utf8', 'replace')) )
        #self.add( Field.Keyword( 'date', date ) )

        # Collect every text/plain part, decoded to unicode.
        body = []
        for part in msg.walk():
            typ = part.get_type()
            if not typ or typ.lower() != "text/plain":
                continue

            # Reset per part: otherwise a failed lookup would either leave
            # the name unbound (first part) or silently reuse the charset
            # of the previous part.
            charset = None
            try:
                charset = part.get_charsets()[0]
            except Exception:
                # Best effort: fall back to the default charset below.
                charset = None
            if not charset:
                charset = 'utf8'

            bdy = part.get_payload(decode=True)
            try:
                bdy = bdy.decode(charset, 'replace')
            except LookupError:
                # The declared charset is not known to Python; skip the part.
                sys.stderr.write("charset lookup error in %s\n" % filename)
                continue
            body.append(bdy)

        # no body found, probably not a multipart msg
        if not body and not msg.is_multipart():
            body = [msg.get_payload().decode('utf8', 'replace')]

        if not body:
            sys.stderr.write("no body for %s\n" % filename)
        body = '\n\n'.join(body)

        self.add( Field.Text( 'body', body ) )

        self.add( Field.Text( 'all', sender + subject + body ) )
コード例 #2
0
    def getDocumentFromFOAF(foaf):
        """Build a Document from a FOAF attribute mapping.

        foaf maps attribute names to lists of values. Only attributes that
        are keys of `fields` are indexed; for those, fields[attr][0] and
        fields[attr][1] are passed through as the third and fourth Field
        arguments. A tuple value, for example (sha, uri), contributes its
        second element; any other value is used as is.

        Returns the populated Document.
        """
        doc = Document()
        for attr, values in foaf.iteritems():
            if attr not in fields:
                # Attribute is not configured for indexing; ignore it.
                # DEBUG information print "E: Field " + attr + " ignored in index"
                continue

            spec = fields[attr]
            # The value is always a list now.
            for item in values:
                if isinstance(item, tuple):
                    item = item[1]
                doc.add(Field(attr, item, spec[0], spec[1]))
        return doc
コード例 #3
0
    def indexReader(self, indexWriter, reader, owner, attribute, version):
        """Index the text supplied by `reader` together with its identifying keys.

        The document gets three fields created with the flags
        (True, False, False): 'owner' (owner.str16()), 'attribute' and
        'version' (str(version)), plus a Field.Text 'contents' field built
        from `reader`. The finished document is handed to
        indexWriter.addDocument().
        """
        key_fields = (
            ("owner", owner.str16()),
            ("attribute", attribute),
            ("version", str(version)),
        )

        entry = Document()
        for field_name, field_value in key_fields:
            entry.add(Field(field_name, field_value, True, False, False))
        entry.add(Field.Text("contents", reader))

        indexWriter.addDocument(entry)
コード例 #4
0
    def __init__( self, msg ):
        """Build an index document from a message object.

        Headers are read with msg.getheader() and the body is whatever
        msg.fp.read() returns. Fields added, in order: 'from', 'subject',
        'body' (Field.Text), 'id', 'date' (Field.Keyword) and 'all'
        (Field.Text of sender + subject + body).

        NOTE(review): getheader() presumably returns None for a missing
        header, which would make the concatenation for 'all' fail -- confirm
        that callers only pass complete messages.
        """
        Document.__init__( self )

        from_header = msg.getheader('From')
        self.add( Field.Text( 'from', from_header ) )

        subject_header = msg.getheader( 'Subject' )
        self.add( Field.Text( 'subject', subject_header ) )

        # The body is read straight from the message's file object.
        body_text = msg.fp.read()
        self.add( Field.Text( 'body', body_text ) )

        message_id = msg.getheader('Message-ID')
        self.add( Field.Keyword( 'id', message_id ) )

        # Re-format the Date header as YYYYMMDDHHMMSS.
        # NOTE(review): strptime is called without a format string -- verify
        # that its default matches the Date headers actually received.
        date_key = strftime( '%Y%m%d%H%M%S', strptime( msg.getheader('Date') ) )
        self.add( Field.Keyword( 'date', date_key ) )

        everything = from_header + subject_header + body_text
        self.add( Field.Text( 'all', everything ) )
コード例 #5
0
ファイル: index_spans.py プロジェクト: sskatoch/TREC-2006
def indexArticle(pmid, text):
    """Index every span of one article as a separate document.

    `text` is split with the module-level `p` (presumably a compiled regex
    with one capturing group -- confirm at its definition). The pieces at
    odd positions of the result are used as span ids and the piece that
    follows each id as that span's text.

    Each document gets the stored fields 'span_id' and 'pmid' (untokenized)
    and 'text' (tokenized), is passed to addAnnotations() and then added to
    the module-level `writer`. A failure on one span is reported on stderr
    and does not stop the remaining spans from being indexed.
    """
    pieces = p.split(text)
    # pieces[1::2] are the span ids, pieces[2::2] the texts following them;
    # an id without a following text is skipped.
    for span_id, span_text in zip(pieces[1::2], pieces[2::2]):
        try:
            doc = Document()
            doc.add(
                Field("span_id", span_id, Field.Store.YES,
                      Field.Index.UN_TOKENIZED))
            doc.add(
                Field("pmid", pmid, Field.Store.YES, Field.Index.UN_TOKENIZED))
            doc.add(
                Field("text", span_text, Field.Store.YES,
                      Field.Index.TOKENIZED))
            addAnnotations(doc, span_id)
            writer.addDocument(doc)
        except Exception as e:
            sys.stderr.write("error: %s pmid: %s span_id: %s\n" %
                             (e, pmid, span_id))
コード例 #6
0
ファイル: index_spans.py プロジェクト: alexksikes/TREC-2006
def indexArticle(pmid, text):
    """Index every span of one article as a separate document.

    `text` is split with the module-level `p` (presumably a compiled regex
    with one capturing group -- confirm at its definition). The pieces at
    odd positions of the result are used as span ids and the piece that
    follows each id as that span's text.

    Each document gets the stored fields 'span_id' and 'pmid' (untokenized)
    and 'text' (tokenized), is passed to addAnnotations() and then added to
    the module-level `writer`. A failure on one span is reported on stderr
    and does not stop the remaining spans from being indexed.
    """
    pieces = p.split(text)
    # Pair each span id with the text piece right after it; an id without
    # a following text is skipped.
    for span_id, span_text in zip(pieces[1::2], pieces[2::2]):
        try:
            doc = Document()
            doc.add(Field("span_id", span_id,
                          Field.Store.YES, Field.Index.UN_TOKENIZED))
            doc.add(Field("pmid", pmid,
                          Field.Store.YES, Field.Index.UN_TOKENIZED))
            doc.add(Field("text", span_text,
                          Field.Store.YES, Field.Index.TOKENIZED))
            addAnnotations(doc, span_id)
            writer.addDocument(doc)
        except Exception as e:
            sys.stderr.write("error: %s pmid: %s span_id: %s\n" % (e, pmid, span_id))
コード例 #7
0
ファイル: lucene_utils.py プロジェクト: wbornor/feednut
 def create_entry_documents(self, feed):
     """Build one index document per entry of `feed`.

     Every document gets a stored, untokenized 'id' ("<feed.xml_url>:<entry
     id>") and 'feed_url'. 'title', 'summary' and 'link' are added only when
     the entry has a non-empty value for them. 'updated' (parsed date in ISO
     format) and 'pickle' (the pickled entry) are stored without indexing
     and are best effort: a failure there does not discard the document.

     An entry whose document cannot be built at all is logged and skipped.

     Returns the list of documents that were built.
     """
     docs = []
     for entry in feed.get_entries():
         try:
             doc = Document()
             # Named entry_id rather than id so the builtin is not shadowed.
             entry_id = '%s:%s' % (feed.xml_url, entry.get('id', None))
             doc.add(
                 Field('id', entry_id, Field.Store.YES,
                       Field.Index.UN_TOKENIZED))
             doc.add(
                 Field('feed_url', feed.xml_url, Field.Store.YES,
                       Field.Index.UN_TOKENIZED))

             optional_fields = (
                 ('title', Field.Index.TOKENIZED),
                 ('summary', Field.Index.TOKENIZED),
                 ('link', Field.Index.UN_TOKENIZED),
             )
             for name, index_mode in optional_fields:
                 value = entry.get(name, None)
                 if value:
                     doc.add(Field(name, value, Field.Store.YES, index_mode))

             try:
                 updated = parser.parse(entry.get('updated', None),
                                        ignoretz=True)
                 doc.add(
                     Field('updated', updated.isoformat(' '),
                           Field.Store.YES, Field.Index.NO))
             except Exception:
                 # A missing or unparseable date just means the document
                 # has no 'updated' field.
                 pass

             try:
                 doc.add(
                     Field('pickle', pickle.dumps(entry), Field.Store.YES,
                           Field.Index.NO))
             except Exception as e:
                 logging.error('Unable to store pickled entry: %s' % e)
             docs.append(doc)
         except Exception as e:
             logging.error(e)
     # Without this the collected documents were dropped and callers got None.
     return docs
コード例 #8
0
ファイル: lucene_utils.py プロジェクト: wbornor/feednut
 def create_feed_document(self, feed):
     """Return an index document describing `feed` itself.

     'id' (str(feed.id)) and 'url' (feed.xml_url) are always added; 'link',
     'title' and 'subtitle' only when the corresponding feed attribute is
     truthy. All fields are stored; 'title' and 'subtitle' are tokenized,
     the others are not.
     """
     keep = Field.Store.YES
     exact = Field.Index.UN_TOKENIZED
     analyzed = Field.Index.TOKENIZED

     feed_doc = Document()
     feed_doc.add(Field('id', str(feed.id), keep, exact))
     feed_doc.add(Field('url', feed.xml_url, keep, exact))

     optional = (
         ('link', feed.channel_link, exact),
         ('title', feed.title, analyzed),
         ('subtitle', feed.subtitle, analyzed),
     )
     for field_name, field_value, index_mode in optional:
         if field_value:
             feed_doc.add(Field(field_name, field_value, keep, index_mode))
     return feed_doc
コード例 #9
0
ファイル: lucene_utils.py プロジェクト: wbornor/feednut
 def create_entry_documents(self, feed):
     """Build one index document per entry of `feed`.

     Every document gets a stored, untokenized 'id' ("<feed.xml_url>:<entry
     id>") and 'feed_url'. 'title', 'summary' and 'link' are added only when
     the entry has a non-empty value for them. 'updated' (parsed date in ISO
     format) and 'pickle' (the pickled entry) are stored without indexing
     and are best effort: a failure there does not discard the document.

     An entry whose document cannot be built at all is logged and skipped.

     Returns the list of documents that were built.
     """
     docs = []
     for entry in feed.get_entries():
         try:
             doc = Document()
             # Named entry_id rather than id so the builtin is not shadowed.
             entry_id = '%s:%s' % (feed.xml_url, entry.get('id', None))
             doc.add(Field('id', entry_id, Field.Store.YES, Field.Index.UN_TOKENIZED))
             doc.add(Field('feed_url', feed.xml_url, Field.Store.YES, Field.Index.UN_TOKENIZED))

             optional_fields = (
                 ('title', Field.Index.TOKENIZED),
                 ('summary', Field.Index.TOKENIZED),
                 ('link', Field.Index.UN_TOKENIZED),
             )
             for name, index_mode in optional_fields:
                 value = entry.get(name, None)
                 if value:
                     doc.add(Field(name, value, Field.Store.YES, index_mode))

             try:
                 updated = parser.parse(entry.get('updated', None), ignoretz=True)
                 doc.add(Field('updated', updated.isoformat(' '), Field.Store.YES, Field.Index.NO))
             except Exception:
                 # A missing or unparseable date just means the document
                 # has no 'updated' field.
                 pass

             try:
                 doc.add(Field('pickle', pickle.dumps(entry), Field.Store.YES, Field.Index.NO))
             except Exception as e:
                 logging.error('Unable to store pickled entry: %s' % e)
             docs.append(doc)
         except Exception as e:
             logging.error(e)
     # Without this the collected documents were dropped and callers got None.
     return docs
コード例 #10
0
ファイル: lucene_utils.py プロジェクト: wbornor/feednut
 def create_feed_document(self, feed):
     """Return an index document describing `feed` itself.

     'id' (str(feed.id)) and 'url' (feed.xml_url) are always added; 'link',
     'title' and 'subtitle' only when the corresponding feed attribute is
     truthy. All fields are stored; 'title' and 'subtitle' are tokenized,
     the others are not.
     """
     stored = Field.Store.YES
     verbatim = Field.Index.UN_TOKENIZED
     tokenized = Field.Index.TOKENIZED

     result = Document()
     result.add(Field('id', str(feed.id), stored, verbatim))
     result.add(Field('url', feed.xml_url, stored, verbatim))

     # Optional attributes, in the order they are added to the document.
     candidates = (
         ('link', feed.channel_link, verbatim),
         ('title', feed.title, tokenized),
         ('subtitle', feed.subtitle, tokenized),
     )
     for label, content, mode in candidates:
         if not content:
             continue
         result.add(Field(label, content, stored, mode))
     return result