def __init__(self, msg, filename):
    """Build an index document from a parsed e-mail message.

    Adds the fields 'from', 'subject', 'id', 'body' and 'all'. Header
    values and body text are decoded to unicode with undecodable bytes
    replaced.

    Args:
        msg: message object providing ``get``, ``walk``, ``is_multipart``
            and ``get_payload`` (presumably an ``email`` Message --
            confirm against the caller).
        filename: name used only in the diagnostics written to stderr;
            it is not indexed.
    """
    Document.__init__(self)

    sender = msg.get('from', '').decode('utf8', 'replace')
    self.add(Field.Text('from', sender))
    subject = msg.get('subject', '').decode('utf8', 'replace')
    self.add(Field.Text('subject', subject))
    message_id = msg.get('Message-ID', '').decode('utf8', 'replace')
    self.add(Field.Keyword('id', message_id))
    # NOTE: the Date header is not indexed by this class.

    body_parts = []
    for part in msg.walk():
        typ = part.get_type()
        if not typ or typ.lower() != "text/plain":
            continue

        # Reset for every part: otherwise a failed lookup would either hit
        # an unbound local (first part) or silently reuse the charset of
        # the previous part.
        charset = None
        try:
            charset = part.get_charsets()[0]
        except Exception:
            # Best effort: fall through to the utf8 default below.
            pass
        if not charset:
            charset = 'utf8'

        payload = part.get_payload(decode=True)
        try:
            payload = payload.decode(charset, 'replace')
        except LookupError:
            # The declared charset is unknown to Python; skip this part.
            sys.stderr.write("charset lookup error in %s\n" % filename)
            continue
        body_parts.append(payload)

    # no body found, probably not a multipart msg
    if not body_parts and not msg.is_multipart():
        body_parts = [msg.get_payload().decode('utf8', 'replace')]
    if not body_parts:
        sys.stderr.write("no body for %s\n" % filename)

    body = '\n\n'.join(body_parts)
    self.add(Field.Text('body', body))
    # Separate the pieces so the last word of one and the first word of
    # the next are not fused into a single token in the catch-all field.
    self.add(Field.Text('all', '\n'.join((sender, subject, body))))
def getDocumentFromFOAF(foaf):
    """Build an index Document from a FOAF attribute mapping.

    Every attribute of ``foaf`` that has an entry in the module-level
    ``fields`` table contributes one Field per value; the first two items
    of that table entry are passed through as the remaining Field
    arguments. Attributes without an entry are reported on stdout and
    skipped.

    Args:
        foaf: mapping of attribute name to a list of values. A value may
            be a tuple, in which case its second element is indexed.

    Returns:
        The populated Document.
    """
    document = Document()
    for attr, values in foaf.iteritems():
        if not fields.has_key(attr):
            # DEBUG information
            print("E: Field " + attr + " ignored in index")
            continue

        # The value of an attribute is always a list.
        for item in values:
            if isinstance(item, tuple):
                # for example (sha, uri): only the second element is used
                text = item[1]
            else:
                text = item
            document.add(Field(attr, text, fields[attr][0], fields[attr][1]))
    return document
def indexReader(self, indexWriter, reader, owner, attribute, version):
    """Index the contents of ``reader`` together with its identifying keys.

    Args:
        indexWriter: writer that receives the finished document.
        reader: content source handed to ``Field.Text`` as 'contents'.
        owner: object whose ``str16()`` result is indexed as 'owner'.
        attribute: value indexed as 'attribute'.
        version: value indexed, via ``str()``, as 'version'.
    """
    document = Document()

    # The three identifying fields share the same flags, passed through
    # unchanged (presumably store/index/tokenize in this Field API --
    # confirm against the PyLucene version in use).
    identifying_values = (
        ("owner", owner.str16()),
        ("attribute", attribute),
        ("version", str(version)),
    )
    for name, value in identifying_values:
        document.add(Field(name, value, True, False, False))

    document.add(Field.Text("contents", reader))
    indexWriter.addDocument(document)
def __init__(self, msg):
    """Build an index document from a message with an open body stream.

    Adds the fields 'from', 'subject', 'body', 'id', 'date' (when the
    Date header can be parsed) and 'all'.

    Args:
        msg: message object providing ``getheader(name)`` and a readable
            ``fp`` positioned at the body (presumably an ``rfc822``-style
            Message -- confirm against the caller).
    """
    Document.__init__(self)

    # getheader() yields None for an absent header; fall back to '' so the
    # fields below and the concatenation for 'all' do not fail on None.
    sender = msg.getheader('From') or ''
    self.add(Field.Text('from', sender))
    subject = msg.getheader('Subject') or ''
    self.add(Field.Text('subject', subject))
    body = msg.fp.read()
    self.add(Field.Text('body', body))
    message_id = msg.getheader('Message-ID') or ''
    self.add(Field.Keyword('id', message_id))

    raw_date = msg.getheader('Date')
    if raw_date:
        try:
            # Original behaviour: strptime with its default format.
            parsed = strptime(raw_date)
        except ValueError:
            # Standard mail dates ("Fri, 21 Nov 1997 09:55:06 -0600") do
            # not match that default format; parse them as RFC 2822.
            from email.utils import parsedate
            parsed = parsedate(raw_date)
        if parsed is not None:
            date = strftime('%Y%m%d%H%M%S', parsed)
            self.add(Field.Keyword('date', date))

    # Separate the pieces so the last word of one and the first word of
    # the next are not fused into a single token in the catch-all field.
    self.add(Field.Text('all', '\n'.join((sender, subject, body))))
def indexArticle(pmid, text):
    """Split an article into spans and add one document per span.

    ``p.split(text)`` is expected to return a list in which odd positions
    hold span ids and the following even position holds that span's text
    (presumably ``p`` is a compiled regex with one capturing group --
    confirm where ``p`` is defined).

    Each span becomes a document with 'span_id', 'pmid' and 'text' fields,
    is passed to ``addAnnotations`` and then written with the module-level
    ``writer``. A failure on one span is reported on stderr and does not
    stop the remaining spans.

    Args:
        pmid: article identifier stored on every span document.
        text: full article text to split.
    """
    pieces = p.split(text)
    for position, span_id in enumerate(pieces[1::2]):
        # The text of the n-th id sits right after it in the split result.
        span_text = pieces[2 * position + 2]
        try:
            document = Document()
            document.add(Field("span_id", span_id,
                               Field.Store.YES, Field.Index.UN_TOKENIZED))
            document.add(Field("pmid", pmid,
                               Field.Store.YES, Field.Index.UN_TOKENIZED))
            document.add(Field("text", span_text,
                               Field.Store.YES, Field.Index.TOKENIZED))
            addAnnotations(document, span_id)
            writer.addDocument(document)
        except Exception as e:
            sys.stderr.write("error: %s pmid: %s span_id: %s\n"
                             % (e, pmid, span_id))
def indexArticle(pmid, text):
    """Index every span of an article as its own document.

    The result of ``p.split(text)`` is walked two items at a time starting
    at index 1: the item at the odd index is used as the span id and the
    item after it as the span text.

    For each span a document with 'span_id', 'pmid' and 'text' fields is
    built, handed to ``addAnnotations`` and added through the module-level
    ``writer``. Errors while building or adding a span are written to
    stderr and the loop moves on to the next span.

    Args:
        pmid: article identifier stored on every span document.
        text: full article text to split.
    """
    tokens = p.split(text)
    for idx in range(1, len(tokens), 2):
        span_id = tokens[idx]
        span_text = tokens[idx + 1]
        try:
            entry = Document()
            field_specs = (
                ("span_id", span_id, Field.Index.UN_TOKENIZED),
                ("pmid", pmid, Field.Index.UN_TOKENIZED),
                ("text", span_text, Field.Index.TOKENIZED),
            )
            for name, value, index_mode in field_specs:
                entry.add(Field(name, value, Field.Store.YES, index_mode))
            addAnnotations(entry, span_id)
            writer.addDocument(entry)
        except Exception as e:
            message = "error: %s pmid: %s span_id: %s\n" % (e, pmid, span_id)
            sys.stderr.write(message)
def create_entry_documents(self, feed):
    """Build one index document per entry of ``feed``.

    Every document gets an 'id' (feed URL and entry id joined by ':') and
    a 'feed_url'. 'title', 'summary' and 'link' are added when the entry
    has a truthy value for them, 'updated' when that value can be parsed,
    and 'pickle' when the entry can be pickled. An entry whose document
    cannot be built is logged and skipped.

    Args:
        feed: object providing ``xml_url`` and ``get_entries()``; entries
            are used through ``get`` and item access.

    Returns:
        List of the documents that were built successfully.
    """
    docs = []
    for entry in feed.get_entries():
        try:
            doc = Document()
            entry_id = '%s:%s' % (feed.xml_url, entry.get('id', None))
            doc.add(Field('id', entry_id,
                          Field.Store.YES, Field.Index.UN_TOKENIZED))
            doc.add(Field('feed_url', feed.xml_url,
                          Field.Store.YES, Field.Index.UN_TOKENIZED))
            if entry.get('title', None):
                doc.add(Field('title', entry['title'],
                              Field.Store.YES, Field.Index.TOKENIZED))
            if entry.get('summary', None):
                doc.add(Field('summary', entry['summary'],
                              Field.Store.YES, Field.Index.TOKENIZED))
            if entry.get('link', None):
                doc.add(Field('link', entry['link'],
                              Field.Store.YES, Field.Index.UN_TOKENIZED))

            try:
                updated = parser.parse(entry.get('updated', None),
                                       ignoretz=True)
                doc.add(Field('updated', updated.isoformat(' '),
                              Field.Store.YES, Field.Index.NO))
            except Exception:
                # Best effort: a missing or unparseable timestamp just
                # leaves the field out. Unlike a bare except, this lets
                # KeyboardInterrupt/SystemExit through.
                pass

            try:
                # NOTE: only unpickle this field from an index you trust.
                doc.add(Field('pickle', pickle.dumps(entry),
                              Field.Store.YES, Field.Index.NO))
            except Exception as e:
                logging.error('Unable to store pickled entry: %s' % e)

            docs.append(doc)
        except Exception as e:
            logging.error(e)
    # The list was previously built but never handed back to the caller.
    return docs
def create_feed_document(self, feed):
    """Build the index document describing ``feed`` itself.

    'id' (``str(feed.id)``) and 'url' (``feed.xml_url``) are always added;
    'link', 'title' and 'subtitle' are added only when the corresponding
    feed attribute is truthy. All fields are stored.

    Args:
        feed: object providing ``id``, ``xml_url``, ``channel_link``,
            ``title`` and ``subtitle``.

    Returns:
        The populated Document.
    """
    result = Document()
    result.add(Field('id', str(feed.id),
                     Field.Store.YES, Field.Index.UN_TOKENIZED))
    result.add(Field('url', feed.xml_url,
                     Field.Store.YES, Field.Index.UN_TOKENIZED))

    # (field name, feed attribute, index mode) for the optional fields,
    # in the order they are added.
    optional_fields = (
        ('link', 'channel_link', Field.Index.UN_TOKENIZED),
        ('title', 'title', Field.Index.TOKENIZED),
        ('subtitle', 'subtitle', Field.Index.TOKENIZED),
    )
    for field_name, attr_name, index_mode in optional_fields:
        value = getattr(feed, attr_name)
        if value:
            result.add(Field(field_name, value, Field.Store.YES, index_mode))
    return result
def create_entry_documents(self, feed):
    """Create an index document for each entry returned by the feed.

    Each document carries an 'id' built as '<feed.xml_url>:<entry id>' and
    a 'feed_url'. The optional fields 'title', 'summary' and 'link' are
    added when the entry holds a truthy value, 'updated' when the value
    parses, and 'pickle' when the entry can be pickled. Failures for a
    single entry are logged and that entry is skipped.

    Args:
        feed: object providing ``xml_url`` and ``get_entries()``; entries
            are used through ``get`` and item access.

    Returns:
        List of the documents that were built successfully.
    """
    stored = Field.Store.YES
    docs = []
    for entry in feed.get_entries():
        try:
            doc = Document()
            entry_id = '%s:%s' % (feed.xml_url, entry.get('id', None))
            doc.add(Field('id', entry_id, stored, Field.Index.UN_TOKENIZED))
            doc.add(Field('feed_url', feed.xml_url, stored,
                          Field.Index.UN_TOKENIZED))
            if entry.get('title', None):
                doc.add(Field('title', entry['title'], stored,
                              Field.Index.TOKENIZED))
            if entry.get('summary', None):
                doc.add(Field('summary', entry['summary'], stored,
                              Field.Index.TOKENIZED))
            if entry.get('link', None):
                doc.add(Field('link', entry['link'], stored,
                              Field.Index.UN_TOKENIZED))

            try:
                updated = parser.parse(entry.get('updated', None),
                                       ignoretz=True)
                doc.add(Field('updated', updated.isoformat(' '), stored,
                              Field.Index.NO))
            except Exception:
                # Best effort: without a parseable timestamp the field is
                # omitted. Catching Exception rather than everything keeps
                # KeyboardInterrupt/SystemExit propagating.
                pass

            try:
                # NOTE: only unpickle this field from an index you trust.
                doc.add(Field('pickle', pickle.dumps(entry), stored,
                              Field.Index.NO))
            except Exception as e:
                logging.error('Unable to store pickled entry: %s' % e)

            docs.append(doc)
        except Exception as e:
            logging.error(e)
    # Hand the accumulated documents back; they were previously discarded.
    return docs
def create_feed_document(self, feed):
    """Return an index document for the feed's own metadata.

    The document always contains 'id' (``str(feed.id)``) and 'url'
    (``feed.xml_url``). 'link', 'title' and 'subtitle' are included only
    for truthy ``channel_link``, ``title`` and ``subtitle`` attributes.
    All fields are stored.

    Args:
        feed: object providing ``id``, ``xml_url``, ``channel_link``,
            ``title`` and ``subtitle``.

    Returns:
        The populated Document.
    """
    document = Document()

    def put(name, value, index_mode):
        # Every field of this document is stored; only the index mode varies.
        document.add(Field(name, value, Field.Store.YES, index_mode))

    put('id', str(feed.id), Field.Index.UN_TOKENIZED)
    put('url', feed.xml_url, Field.Index.UN_TOKENIZED)

    link = feed.channel_link
    if link:
        put('link', link, Field.Index.UN_TOKENIZED)

    title = feed.title
    if title:
        put('title', title, Field.Index.TOKENIZED)

    subtitle = feed.subtitle
    if subtitle:
        put('subtitle', subtitle, Field.Index.TOKENIZED)

    return document