# indexing.py
# coding=utf8
import os, sys, glob, codecs
from pyquery import PyQuery as pq
from pltk.PerStemmer import PerStemmer
import lucene
from lucene import SimpleFSDirectory,\
System,\
File,\
Document,\
Field,\
StandardAnalyzer,\
IndexWriter,\
Version
""" Indexing class: builds a Lucene index from the XML article corpus.
"""
class Indexing():
    """Build a Lucene index from a directory of XML article files.

    Every ``*.xml`` file in ``doc_dir`` is parsed with PyQuery and each
    ``Article`` element found in it is added to the index as one document
    with the fields ``id``, ``title``, ``abstract``, ``keyword``,
    ``content``, ``authors`` and a combined ``article`` field.
    Title, abstract and content are passed through ``PerStemmer`` first.
    """

    def __init__(self, doc_dir='./data/corpus', index_dir='./data/index-dir'):
        """Remember the corpus / index locations and create the stemmer.

        doc_dir   -- directory that is searched for ``*.xml`` input files.
        index_dir -- directory handed to Lucene as the index location.
        """
        self.DOC_DIR = doc_dir
        self.INDEX_DIR = index_dir
        self.stemmer = PerStemmer()

    def stem(self, token):
        """Return ``token`` as processed by ``PerStemmer.stemText``."""
        return self.stemmer.stemText(token)

    @staticmethod
    def _utf8(value):
        """Return ``value`` UTF-8 encoded, or an empty byte string for None.

        PyQuery's ``attr()`` / ``html()`` return None when the attribute or
        element is absent; mapping that to an empty string keeps one
        incomplete article from aborting the whole indexing run.
        """
        if value is None:
            return b''
        return value.encode('utf-8')

    def _build_document(self, text):
        """Turn one PyQuery ``Article`` node into a Lucene ``Document``."""
        art_id = self._utf8(text.attr('articleid')).replace(b'+', b'-')
        art_title = self.stem(self._utf8(text.attr('title')))
        art_abstract = self.stem(self._utf8(text.find('Abstract').html()))
        art_keyword = self._utf8(text.find('Keyword').html())
        art_content = self.stem(self._utf8(text.find('Content').html()))
        art_authors = self._utf8(text.find('Authors').html())

        # Join with a space so the last token of one part is not glued to
        # the first token of the next part before the analyzer sees it.
        art_all = b' '.join((art_title, art_abstract, art_keyword, art_content))

        # Field order is kept identical to the original implementation.
        fields = (
            ('id', art_id),
            ('title', art_title),
            ('abstract', art_abstract),
            ('keyword', art_keyword),
            ('content', art_content),
            ('authors', art_authors),
            ('article', art_all),
        )
        document = Document()
        for name, value in fields:
            document.add(Field(name, value, Field.Store.YES, Field.Index.ANALYZED))
        return document

    def index(self):
        """Index every ``Article`` of every ``*.xml`` file in ``DOC_DIR``.

        Starts the Lucene VM, opens an ``IndexWriter`` on ``INDEX_DIR``,
        adds one document per article, optimizes the index and closes the
        writer.  The writer is closed even if parsing or indexing fails,
        so it is not left open after an error.
        """
        lucene.initVM()
        indexdir = SimpleFSDirectory(File(self.INDEX_DIR))
        analyzer = StandardAnalyzer(Version.LUCENE_30)
        index_writer = IndexWriter(indexdir, analyzer, True,
                                   IndexWriter.MaxFieldLength(512))
        try:
            # read input files (.xml)
            for in_file in glob.glob(os.path.join(self.DOC_DIR, '*.xml')):
                # Close each corpus file as soon as it has been read.
                with codecs.open(in_file, encoding='utf-8') as handle:
                    corpus = handle.read()
                d = pq(corpus, parser='html')
                for text in d('Article').items():
                    index_writer.addDocument(self._build_document(text))
            # Only optimize when every document was added successfully.
            index_writer.optimize()
        finally:
            index_writer.close()