コード例 #1
0
 def indexDocs(self, root, writer):
     for root, dirnames, filenames in os.walk(root):
         for filename in filenames:
             if not filename.endswith('.txt'):
                 continue
             print "adding", filename
             try:
                 path = os.path.join(root, filename)
                 file = open(path)
                 for line in file:
                     doc = Document()
                     arr = line.split('\t')
                     field = Field("name", arr[2].lower(), Field.Store.YES,
                                   Field.Index.TOKENIZED)
                     field.setBoost(1.5)
                     doc.add(field)
                     doc.add(
                         Field("alternate_names", arr[3].lower(),
                               Field.Store.YES, Field.Index.TOKENIZED))
                     doc.add(
                         Field("state", arr[10].lower(), Field.Store.YES,
                               Field.Index.TOKENIZED))
                     doc.add(
                         Field("population", arr[14], Field.Store.YES,
                               Field.Index.UN_TOKENIZED))
                     if int(arr[14]) > 1000000:
                         doc.setBoost(1.2)
                     writer.addDocument(doc)
                 file.close()
             except Exception, e:
                 print "Failed in indexDocs:", e
コード例 #2
0
def indexDocuments():
    # empty index directory
    indexDir = Wikipedia.directory + 'index/'
    for filename in os.listdir(indexDir):
        os.remove(indexDir + filename)

    # index documents
    lucene.initVM()
    version = Version.LUCENE_CURRENT
    analyzer = EnglishAnalyzer(version)
    writer = IndexWriter(SimpleFSDirectory(File(indexDir)), analyzer, True,
                         IndexWriter.MaxFieldLength.LIMITED)

    for article in Wikipedia():
        doc = Document()
        doc.add(
            Field('id', str(article['id'][0]), Field.Store.YES,
                  Field.Index.NOT_ANALYZED))
        doc.add(
            Field('title', article['url'], Field.Store.YES,
                  Field.Index.NOT_ANALYZED))
        doc.add(
            Field('content', article['text'], Field.Store.NO,
                  Field.Index.ANALYZED))
        writer.addDocument(doc)

    print 'Optimization'
    writer.optimize()
    writer.close()
コード例 #3
0
    def indexDocs(self, root, writer):
        """Read picIndex.txt (utf-8, 'seg^*'-separated records) and index
        one document per unique image src.

        Record layout per line: url, title, src, alt.  Records are keyed
        by src, so a later duplicate src overwrites an earlier one.
        """
        picDict = {}
        # 'with' closes the file even if a malformed line raises below;
        # iterating the file object replaces the deprecated xreadlines()
        with codecs.open('picIndex.txt', 'r', encoding='utf-8') as f:
            for line in f:
                ls = line.split('seg^*')
                url = ls[0]
                title = ls[1]
                src = ls[2]
                alt = ls[3]
                picDict[src] = [url, title, alt]
        for src in picDict:
            doc = Document()
            doc.add(Field("src", src,
                          Field.Store.YES,
                          Field.Index.NOT_ANALYZED))
            doc.add(Field("url", picDict[src][0],
                          Field.Store.YES,
                          Field.Index.NOT_ANALYZED))
            doc.add(Field("title", picDict[src][1],
                          Field.Store.YES,
                          Field.Index.NOT_ANALYZED))
            # only the alt text is analyzed for full-text search
            doc.add(Field("alt", picDict[src][2],
                          Field.Store.YES,
                          Field.Index.ANALYZED))
            writer.addDocument(doc)
コード例 #4
0
    def addDocuments(self, dir):
        """Index every word in self.docs into *dir* four ways: stored
        keyword, stored-unindexed, unstored-analyzed, and stored-analyzed
        text fields.
        """
        writer = IndexWriter(dir, SimpleAnalyzer(), True,
                             IndexWriter.MaxFieldLength.UNLIMITED)

        # mergeFactor / maxMergeDocs / minMergeDocs may be tuned here to
        # adjust indexing performance with FSDirectory

        try:
            for word in self.docs:
                doc = Document()
                doc.add(
                    Field("keyword", word, Field.Store.YES,
                          Field.Index.NOT_ANALYZED))
                doc.add(Field("unindexed", word, Field.Store.YES,
                              Field.Index.NO))
                doc.add(
                    Field("unstored", word, Field.Store.NO,
                          Field.Index.ANALYZED))
                doc.add(Field("text", word, Field.Store.YES,
                              Field.Index.ANALYZED))
                writer.addDocument(doc)
            writer.optimize()
        finally:
            # always release the index write lock -- the original left
            # the writer open if addDocument/optimize raised
            writer.close()
コード例 #5
0
    def setUp(self):
        """Build a two-document RAM index: one document per owner, each
        with an un-analyzed owner field and analyzed keywords.
        """
        self.directory = RAMDirectory()
        writer = IndexWriter(self.directory, WhitespaceAnalyzer(), True,
                             IndexWriter.MaxFieldLength.UNLIMITED)

        # Elwood first, then Jake -- insertion order matters for doc ids
        for owner, keywords in (("elwood", "elwoods sensitive info"),
                                ("jake", "jakes sensitive info")):
            document = Document()
            document.add(Field("owner", owner, Field.Store.YES,
                               Field.Index.NOT_ANALYZED))
            document.add(Field("keywords", keywords, Field.Store.YES,
                               Field.Index.ANALYZED))
            writer.addDocument(document)

        writer.close()
コード例 #6
0
    def testUpdate(self):
        """Delete the Amsterdam document via a writable reader, then
        re-add the same id as St. Petersburg and verify the swap."""
        self.assertEqual(1, self.getHitCount("city", "Amsterdam"))

        # deletion goes through a non-read-only IndexReader
        reader = IndexReader.open(self.dir, False)
        reader.deleteDocuments(Term("city", "Amsterdam"))
        reader.close()

        # append (create=False) the replacement document
        writer = IndexWriter(self.dir, self.getAnalyzer(), False,
                             IndexWriter.MaxFieldLength.UNLIMITED)
        document = Document()
        document.add(Field("id", "1", Field.Store.YES,
                           Field.Index.NOT_ANALYZED))
        document.add(Field("country", "Russia", Field.Store.YES,
                           Field.Index.NO))
        document.add(Field("contents",
                           "St. Petersburg has lots of bridges",
                           Field.Store.NO, Field.Index.ANALYZED))
        document.add(Field("city", "St. Petersburg", Field.Store.YES,
                           Field.Index.ANALYZED))
        writer.addDocument(document)
        writer.optimize()
        writer.close()

        self.assertEqual(0, self.getHitCount("city", "Amsterdam"))
        self.assertEqual(1, self.getHitCount("city", "Petersburg"))
コード例 #7
0
    def setUp(self):
        """Index two fox sentences and prepare one SpanTermQuery fixture
        attribute per word of interest (self.quick, self.brown, ...)."""
        self.directory = RAMDirectory()
        self.analyzer = WhitespaceAnalyzer()

        writer = IndexWriter(self.directory, self.analyzer, True,
                             IndexWriter.MaxFieldLength.UNLIMITED)
        for sentence in ("the quick brown fox jumps over the lazy dog",
                         "the quick red fox jumps over the sleepy cat"):
            doc = Document()
            doc.add(Field("f", sentence, Field.Store.YES,
                          Field.Index.ANALYZED))
            writer.addDocument(doc)
        writer.close()

        self.searcher = IndexSearcher(self.directory, True)
        self.reader = IndexReader.open(self.directory, True)

        # each word becomes an attribute holding its SpanTermQuery
        for word in ("quick", "brown", "red", "fox",
                     "lazy", "sleepy", "dog", "cat"):
            setattr(self, word, SpanTermQuery(Term("f", word)))
コード例 #8
0
ファイル: search.py プロジェクト: JeffAMcGee/crowdy
 def addCrowd(self, id, text):
     """Insert or refresh the crowd *id* in the index.

     updateDocument performs an atomic delete-then-add keyed on the id
     term, so an existing document with the same id is replaced.
     """
     key = Term(CrowdFields.id, id)
     id_field = Field(CrowdFields.id, id, Field.Store.YES,
                      Field.Index.NOT_ANALYZED)
     text_field = Field(CrowdFields.text, text, Field.Store.YES,
                        Field.Index.ANALYZED)
     doc = Document()
     doc.add(id_field)
     doc.add(text_field)
     self.writer.updateDocument(key, doc)
コード例 #9
0
 def addDocuments(self, _id, title, content):
     """Add one document with a stored id; title and content are indexed
     (unstored) only when present and non-empty.
     """
     doc = Document()
     doc.add(Field("id", _id, Field.Store.YES, Field.Index.NOT_ANALYZED))
     # truthiness replaces 'is not None and len(...) > 0' -- equivalent
     # for the str-or-None values expected here (TODO confirm callers
     # never pass other falsy types)
     if title:
         doc.add(
             Field("titleKeyword", title, Field.Store.NO,
                   Field.Index.ANALYZED))
     if content:
         doc.add(
             Field("contentKeyword", content, Field.Store.NO,
                   Field.Index.ANALYZED))
     self.index_writer.addDocument(doc)
コード例 #10
0
    def addPoint(self, writer, name, type, x, y):
        """Index a named, typed point; coordinates are stored as strings
        without norms so they can be used for exact matching/sorting."""
        doc = Document()
        # textual identity fields
        for key, value in (("name", name), ("type", type)):
            doc.add(Field(key, value, Field.Store.YES,
                          Field.Index.NOT_ANALYZED))
        # numeric coordinates, stringified, no norms
        for key, value in (("x", x), ("y", y)):
            doc.add(Field(key, str(value), Field.Store.YES,
                          Field.Index.NOT_ANALYZED_NO_NORMS))
        writer.addDocument(doc)
コード例 #11
0
ファイル: Indexer.py プロジェクト: BurnedRobot/SearchEngine
def _IndexStringField(doc, field_name, field_content):
    """Add *field_content* to *doc* as a stored string field.

    None content is skipped entirely.  The "id" field is indexed
    un-analyzed (exact match); every other field is analyzed.
    """
    if field_content is None:
        return
    if field_name == "id":
        index_mode = Field.Index.NOT_ANALYZED
    else:
        index_mode = Field.Index.ANALYZED
    doc.add(Field(field_name, str(field_content), Field.Store.YES, index_mode))
コード例 #12
0
    def indexReader(self, indexWriter, reader, uItem, uAttr, uValue, version):
        """Index the full text from *reader* together with item metadata.

        Metadata is stored; item/attribute are untokenized, value/version
        are unindexed.  Contents get term vectors.
        """
        doc = Document()
        doc.add(Field("item", uItem.str64(), Field.Store.YES,
                      Field.Index.UN_TOKENIZED))
        doc.add(Field("attribute", uAttr.str64(), Field.Store.YES,
                      Field.Index.UN_TOKENIZED))
        doc.add(Field("value", uValue.str64(), Field.Store.YES,
                      Field.Index.NO))
        doc.add(Field("version", str(version), Field.Store.YES,
                      Field.Index.NO))
        # drain the incoming reader into a StringReader for the contents
        contents = StringReader(reader.read())
        doc.add(Field("contents", contents, Field.TermVector.YES))

        indexWriter.addDocument(doc)
コード例 #13
0
ファイル: indexer.py プロジェクト: mefagan/relevancefeedback-
def createIndex():
    """Parse every file in html_files/ and index its extracted text into
    /Tmp/REMOVEME.index-dir as a stored, analyzed "text" field.
    """
    print("started indexer")
    lucene.initVM()
    indexDir = "/Tmp/REMOVEME.index-dir"

    analyzer = StandardAnalyzer(Version.LUCENE_30)
    # 'store' avoids shadowing the builtin dir()
    store = lucene.SimpleFSDirectory(lucene.File(indexDir))

    writer = IndexWriter(store, analyzer, True,
                         IndexWriter.MaxFieldLength(512))

    src_dir = 'html_files'
    # the original kept a counter 'i' that was never used -- dropped
    for name in os.listdir(src_dir):
        path = os.path.join(src_dir, name)
        with open(path, 'r') as myfile:
            data = myfile.read()
        # parsehtml returns (text, parse-errors); errors are ignored here
        document, _errors = parsehtml(data)
        doc = Document()
        doc.add(Field("text", document, Field.Store.YES,
                      Field.Index.ANALYZED))
        writer.addDocument(doc)
    writer.optimize()
    writer.close()
コード例 #14
0
    def main(cls, argv):

        if len(argv) < 5:
            print "Usage: python IndexTuningDemo.py <numDocs> <mergeFactor> <maxMergeDocs> <maxBufferedDocs>"
            return

        docsInIndex = int(argv[1])

        # create an index called 'index-dir' in a temp directory
        indexDir = os.path.join(System.getProperty('java.io.tmpdir', 'tmp'),
                                'index-dir')
        dir = FSDirectory.getDirectory(indexDir, True)
        analyzer = SimpleAnalyzer()
        writer = IndexWriter(dir, analyzer, True)

        # set variables that affect speed of indexing
        writer.setMergeFactor(int(argv[2]))
        writer.setMaxMergeDocs(int(argv[3]))
        writer.setMaxBufferedDocs(int(argv[4]))
        # writer.infoStream = System.out

        print "Merge factor:  ", writer.getMergeFactor()
        print "Max merge docs:", writer.getMaxMergeDocs()
        print "Max buffered docs:", writer.getMaxBufferedDocs()

        start = time()
        for i in xrange(docsInIndex):
            doc = Document()
            doc.add(
                Field("fieldname", "Bibamus", Field.Store.YES,
                      Field.Index.TOKENIZED))
            writer.addDocument(doc)

        writer.close()
        print "Time: ", timedelta(seconds=time() - start)
コード例 #15
0
ファイル: app.py プロジェクト: ProjectLISM/GUI
def configure_lucene():

    f = open('clique.txt', 'r')
    lucene.initVM()
    print 'Inside Function'
    #indexDir = "/tmp/luceneindex"
    dir = SimpleFSDirectory(File(indexDir))
    analyzer = StandardAnalyzer(lucene.Version.LUCENE_CURRENT)
    writer = IndexWriter(dir, analyzer, True, IndexWriter.MaxFieldLength(512))

    print >> sys.stderr, "Currently there are %d documents in the index..." % writer.numDocs(
    )

    print >> sys.stderr, "Reading lines from sys.stdin..."
    for line in f:
        line = line.replace('\t', '')
        line = line.replace('\r', '')
        line = line.replace('\n', '')
        line = line.replace('^', '')
        line = line.strip()
        doc = Document()
        doc.add(Field("text", line, Field.Store.YES, Field.Index.ANALYZED))
        writer.addDocument(doc)

    print >> sys.stderr, "Indexed lines from stdin (%d documents in index)" % (
        writer.numDocs())
    print >> sys.stderr, "About to optimize index of %d documents..." % writer.numDocs(
    )
    writer.optimize()
    print >> sys.stderr, "...done optimizing index of %d documents" % writer.numDocs(
    )
    print >> sys.stderr, "Closing index of %d documents..." % writer.numDocs()
    writer.close()
コード例 #16
0
ファイル: pylucene_test.py プロジェクト: SamChen1981/spider-1
def luceneIndexer(contents):
    lucene.initVM()
    

    INDEXIDR= settings.INDEX_DIR

    indexdir= SimpleFSDirectory(File(INDEXIDR))
    
    analyzer= StandardAnalyzer(Version.LUCENE_30)

    index_writer= IndexWriter(indexdir,analyzer,True,\

    IndexWriter.MaxFieldLength(512))
    for tfile in contents:
        print"Indexing: ", tfile

        document= Document()

        content= tfile.getvalue()

        document.add(Field("text",content,Field.Store.YES,\
                           Field.Index.ANALYZED))
        index_writer.addDocument(document)
        print"Done: ", tfile
        index_writer.optimize()
        print index_writer.numDocs()
    index_writer.close()
コード例 #17
0
    def setUp(self):
        """One-document RAM index: an exact-match part number plus an
        analyzed description, searched via self.searcher."""
        self.directory = RAMDirectory()
        writer = IndexWriter(self.directory, SimpleAnalyzer(), True,
                             IndexWriter.MaxFieldLength.UNLIMITED)

        document = Document()
        document.add(Field("partnum", "Q36", Field.Store.YES,
                           Field.Index.NOT_ANALYZED))
        document.add(Field("description", "Illidium Space Modulator",
                           Field.Store.YES, Field.Index.ANALYZED))
        writer.addDocument(document)
        writer.close()

        self.searcher = IndexSearcher(self.directory, True)
コード例 #18
0
    def indexValue(self, indexWriter, value, uItem, uAttr, uValue, version):
        """Index string *value* as unstored, tokenized contents (with term
        vectors) together with stored item/attribute/value/version metadata.
        """
        doc = Document()
        doc.add(Field("item", uItem.str64(), Field.Store.YES,
                      Field.Index.UN_TOKENIZED))
        doc.add(Field("attribute", uAttr.str64(), Field.Store.YES,
                      Field.Index.UN_TOKENIZED))
        doc.add(Field("value", uValue.str64(), Field.Store.YES,
                      Field.Index.NO))
        doc.add(Field("version", str(version), Field.Store.YES,
                      Field.Index.NO))
        # contents: unstored, tokenized, keeping term vectors
        doc.add(Field("contents", value, Field.Store.NO,
                      Field.Index.TOKENIZED, Field.TermVector.YES))
        indexWriter.addDocument(doc)
コード例 #19
0
ファイル: ScoreTest.py プロジェクト: lauromoraes/pylucene
    def testFuzzy(self):
        """FuzzyQuery("wuzza") should match both "fuzzy" and "wuzzy", and
        "wuzzy" (one edit away) should outrank "fuzzy" (two edits away).
        """

        # index two single-field documents differing by one letter
        self.indexSingleFieldDocs([Field("contents", "fuzzy", Field.Store.YES,
                                         Field.Index.ANALYZED),
                                   Field("contents", "wuzzy", Field.Store.YES,
                                         Field.Index.ANALYZED)])

        searcher = IndexSearcher(self.directory)
        query = FuzzyQuery(Term("contents", "wuzza"))
        scoreDocs = searcher.search(query, 50).scoreDocs
        # both terms fall within the default fuzzy similarity threshold
        self.assertEqual(2, len(scoreDocs), "both close enough")

        # a closer edit distance must produce a different score ...
        self.assert_(scoreDocs[0].score != scoreDocs[1].score,
                     "wuzzy closer than fuzzy")
        # ... and the top hit should be the "wuzzy" document
        self.assertEqual("wuzzy",
                         searcher.doc(scoreDocs[0].doc).get("contents"),
                         "wuzza bear")
コード例 #20
0
    def indexFile(self, writer, path):
        """Index one HTML file: tokenized contents plus a stored,
        un-analyzed absolute filename.

        Returns the Document added to *writer*; any read error propagates
        to the caller.
        """
        # 'with' closes the file even if HTMLReader.read() raises, which
        # the original open()/close() sequence did not guarantee; it also
        # stops shadowing the 'file' builtin
        with open(path) as fp:
            string = HTMLReader(InputStreamReader(fp, 'utf-8')).read()

        doc = Document()
        doc.add(Field("contents", StringReader(string)))
        doc.add(
            Field("filename", os.path.abspath(path), Field.Store.YES,
                  Field.Index.NOT_ANALYZED))
        writer.addDocument(doc)

        return doc
コード例 #21
0
ファイル: T9er.py プロジェクト: lauromoraes/pylucene
    def main(cls, argv):
        """Build a T9 index from a WordNet word index.

        argv: [program, <WordNet index dir>, <t9 index dir>].  Every word
        in the source index is re-indexed with its T9 digit string and
        its length.
        """
        if len(argv) != 3:
            print "Usage: T9er <WordNet index dir> <t9 index>"
            return

        # build the letter->digit map from cls.keys; each entry's first
        # character is the digit and the rest are its letters
        # (presumably entries like "2abc" -- verify against cls.keys)
        for key in cls.keys:
            c = key[0]
            k = key[1:]
            for kc in k:
                cls.keyMap[kc] = c
                print kc, "=", c

        indexDir = argv[1]
        t9dir = argv[2]

        reader = IndexReader.open(indexDir)

        numDocs = reader.maxDoc()
        print "Processing", numDocs, "words"

        writer = IndexWriter(t9dir, WhitespaceAnalyzer(), True)

        for id in xrange(reader.maxDoc()):
            origDoc = reader.document(id)
            word = origDoc.get("word")
            # skip documents without a usable word
            if word is None or len(word) == 0:
                continue

            newDoc = Document()
            newDoc.add(
                Field("word", word, Field.Store.YES, Field.Index.UN_TOKENIZED))
            newDoc.add(
                Field("t9", cls.t9(word), Field.Store.YES,
                      Field.Index.UN_TOKENIZED))
            newDoc.add(
                Field("length", str(len(word)), Field.Store.NO,
                      Field.Index.UN_TOKENIZED))
            writer.addDocument(newDoc)
            # progress heartbeat every 100 documents
            if id % 100 == 0:
                print "Document", id

        writer.optimize()
        writer.close()

        reader.close()
コード例 #22
0
    def _addDoc(self, text, writer):
        """Add *text* to the Lucene index as a single stored, analyzed
        field named "field"."""
        document = Document()
        field = Field("field", text, Field.Store.YES, Field.Index.ANALYZED)
        document.add(field)
        writer.addDocument(document)
コード例 #23
0
    def setUp(self):
        """Two-document index over a single analyzed "field", searched
        through self.searcher."""
        directory = RAMDirectory()
        writer = IndexWriter(directory, WhitespaceAnalyzer(), True,
                             IndexWriter.MaxFieldLength.UNLIMITED)

        # insertion order fixes the doc ids the tests rely on
        for sentence in ("the quick brown fox jumped over the lazy dog",
                         "the fast fox hopped over the hound"):
            doc = Document()
            doc.add(Field("field", sentence, Field.Store.YES,
                          Field.Index.ANALYZED))
            writer.addDocument(doc)
        writer.close()

        self.searcher = IndexSearcher(directory, True)
コード例 #24
0
    def indexFile(self, writer, path):

        doc = Document()

        try:
            process = popen2.Popen4(["antiword", "-m", "UTF-8", path])
            string = InputStreamReader(process.fromchild, 'utf-8').read()
        except:
            raise
        else:
            doc.add(Field("contents", StringReader(string)))
            doc.add(Field("filename", os.path.abspath(path),
                          Field.Store.YES, Field.Index.NOT_ANALYZED))
            writer.addDocument(doc)

            exitCode = process.wait()
            if exitCode != 0:
                raise RuntimeError, "pdftotext exit code %d" %(exitCode)

            return doc
コード例 #25
0
    def addDocuments(self, dir, maxFieldLength):
        """Populate *dir* with one document per parallel row of
        self.keywords/unindexed/unstored/text, truncating fields at
        *maxFieldLength* terms, then optimize and close.
        """
        writer = IndexWriter(dir, SimpleAnalyzer(), True,
                             IndexWriter.MaxFieldLength(maxFieldLength))

        rows = izip(self.keywords, self.unindexed, self.unstored, self.text)
        for keyword, unindexed, unstored, text in rows:
            doc = Document()
            doc.add(Field("id", keyword, Field.Store.YES,
                          Field.Index.NOT_ANALYZED))
            doc.add(Field("country", unindexed, Field.Store.YES,
                          Field.Index.NO))
            doc.add(Field("contents", unstored, Field.Store.NO,
                          Field.Index.ANALYZED))
            doc.add(Field("city", text, Field.Store.YES,
                          Field.Index.ANALYZED))
            writer.addDocument(doc)

        writer.optimize()
        writer.close()
コード例 #26
0
ファイル: ScoreTest.py プロジェクト: lauromoraes/pylucene
    def testWildcard(self):
        """?ild* should match wild, mild and mildew -- but not child --
        and all three matches should score identically.
        """
        self.indexSingleFieldDocs([Field("contents", "wild", Field.Store.YES,
                                         Field.Index.ANALYZED),
                                   Field("contents", "child", Field.Store.YES,
                                         Field.Index.ANALYZED),
                                   Field("contents", "mild", Field.Store.YES,
                                         Field.Index.ANALYZED),
                                   Field("contents", "mildew", Field.Store.YES,
                                         Field.Index.ANALYZED)])

        searcher = IndexSearcher(self.directory)
        query = WildcardQuery(Term("contents", "?ild*"))
        scoreDocs = searcher.search(query, 50).scoreDocs
        self.assertEqual(3, len(scoreDocs), "child no match")

        self.assertEqual(scoreDocs[0].score, scoreDocs[1].score,
                         "score the same")
        # BUG FIX: this assertion compared scoreDocs[1].score with itself,
        # which is vacuously true; compare the second and third hits
        self.assertEqual(scoreDocs[1].score, scoreDocs[2].score,
                         "score the same")
コード例 #27
0
    def index(self, doc, title, department, url):
        """Add one page (title, url, department, body text) to the
        on-disk index; all four fields are stored and analyzed."""
        store = SimpleFSDirectory(File(self.indir))
        analyzer = StandardAnalyzer(Version.LUCENE_30)
        writer = IndexWriter(store, analyzer, self.init,
                             IndexWriter.MaxFieldLength(512))
        # only the very first call may create/overwrite the index
        self.init = False

        document = Document()
        for key, value in (("title", title), ("url", url),
                           ("department", department), ("text", doc)):
            document.add(Field(key, value, Field.Store.YES,
                               Field.Index.ANALYZED))
        writer.addDocument(document)

        writer.optimize()
        writer.close()
コード例 #28
0
def parse_file(file_path, writer):
    """Index one HTML file: its path, the text pulled from common content
    tags, and the AFINN sentiment score of that text.
    """
    # 'with' guarantees the file handle is released
    with open(file_path, 'r') as f:
        soup = BeautifulSoup(f.read())

    content_tags = ['title', 'p', 'h1', 'h2', 'h3', 'h4', 'h5', 'h6']
    # PERF: collect fragments and join once -- the original's repeated
    # 'content +=' is quadratic in the number of fragments
    pieces = []
    for tag in content_tags:
        for match in soup.find_all(tag):
            if match.string:
                pieces.append(match.string + " ")
    content = "".join(pieces)

    afinn_score = afinn.sentiment(content)
    doc = Document()
    doc.add(Field("filepath", file_path, Field.Store.YES,
                  Field.Index.ANALYZED))
    doc.add(Field("content", content, Field.Store.YES, Field.Index.ANALYZED))
    doc.add(
        Field("sentiment", str(afinn_score), Field.Store.YES,
              Field.Index.ANALYZED))
    writer.addDocument(doc)
コード例 #29
0
    def setUp(self):
        """Single-document RAM index built with the Porter analyzer."""
        self.directory = RAMDirectory()
        writer = IndexWriter(self.directory, self.porterAnalyzer, True,
                             IndexWriter.MaxFieldLength.UNLIMITED)

        document = Document()
        field = Field("contents",
                      "The quick brown fox jumps over the lazy dogs",
                      Field.Store.YES, Field.Index.ANALYZED)
        document.add(field)
        writer.addDocument(document)
        writer.close()
コード例 #30
0
ファイル: PDFHandler.py プロジェクト: lauromoraes/pylucene
    def indexFile(self, writer, path):
        """Index a PDF via the pdfinfo and pdftotext command-line tools.

        pdfinfo metadata lines become stored, un-analyzed fields; the
        pdftotext output becomes the tokenized "contents" field.  Raises
        RuntimeError when either tool exits non-zero; returns the added
        Document on success.
        """
        doc = Document()

        try:
            # first pass: PDF metadata via pdfinfo
            process = popen2.Popen4(["pdfinfo", "-enc", "UTF-8", path])
        except:
            raise
        else:
            # read metadata until the first blank line
            while True:
                line = process.fromchild.readline().strip()
                if not line:
                    break
                # each metadata line looks like "Name: value"
                name, value = line.split(':', 1)
                doc.add(
                    Field(name.strip(), value.strip(), Field.Store.YES,
                          Field.Index.NOT_ANALYZED))

            exitCode = process.wait()
            if exitCode != 0:
                raise RuntimeError, "pdfinfo exit code %d" % (exitCode)

        try:
            # second pass: full text via pdftotext writing to stdout ("-")
            process = popen2.Popen4(["pdftotext", "-enc", "UTF-8", path, "-"])
            string = InputStreamReader(process.fromchild, 'utf-8').read()
        except:
            raise
        else:
            doc.add(Field("contents", StringReader(string)))
            doc.add(
                Field("filename", os.path.abspath(path), Field.Store.YES,
                      Field.Index.NOT_ANALYZED))
            writer.addDocument(doc)

            exitCode = process.wait()
            if exitCode != 0:
                raise RuntimeError, "pdftotext exit code %d" % (exitCode)

            return doc